 2022-01-06: This scraper is not maintained. Future work to gather this
 data should be done by scraping the CourtListener API
 https://www.courtlistener.com/api/rest/v3/clusters/?docket__court__id=me
+
+2025-03-31: This scraper has been updated with a backscraper (flooie)
 """

-from lxml import html
+import re
+from datetime import date
+
+from juriscraper.AbstractSite import logger
+from juriscraper.lib.judge_parsers import normalize_judge_names
+from juriscraper.lib.string_utils import convert_date_string, titlecase
+from juriscraper.OpinionSiteLinear import OpinionSiteLinear

-from juriscraper.lib.string_utils import convert_date_string
-from juriscraper.OpinionSite import OpinionSite

+class Site(OpinionSiteLinear):
+    url_template = (
+        "https://www.courts.maine.gov/courts/sjc/lawcourt/{}/index.html"
+    )
+    first_opinion_year = 2017

-class Site(OpinionSite):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.court_id = self.__module__
         self.url = "https://www.courts.maine.gov/courts/sjc/opinions.html"
-        self.path_root = '//table[contains(.//th[1], "Opinion")]'
-
-    def _get_cell_path(self, cell_number: int, subpath: str = "") -> str:
-        path = '//table[contains(.//th[1], "Opinion")]//td[%d]'
-        return f"{path}/{subpath}" if subpath else path
-
-    def _get_download_urls(self):
-        path = f"{self.path_root}//td[2]/a[1]/@href"
-        return list(self.html.xpath(path))
-
-    def _get_case_names(self):
-        case_names = []
-        path = f"{self.path_root}//td[2]/a[1]"
-        for e in self.html.xpath(path):
-            s = html.tostring(e, method="text", encoding="unicode")
-            case_names.append(s)
-        return case_names
-
-    def _get_case_dates(self):
-        dates = []
-        path = f"{self.path_root}//td[3]"
-        for cell in self.html.xpath(path):
-            date_string = cell.text_content().replace("Aguust", "August")
-            dates.append(convert_date_string(date_string))
-        return dates
-
-    def _get_precedential_statuses(self):
-        return ["Published"] * len(self.case_names)
-
-    def _get_citations(self):
-        path = f"{self.path_root}//td[1]//text()"
-        return list(self.html.xpath(path))
+        self.path_root = '//table[contains(.//th[1], "Opinion")]//tr[td]'
+        self.status = "Published"
+        self.make_backscrape_iterable(kwargs)
+
+    def _process_html(self):
+        for row in self.html.xpath(self.path_root):
+            cite, name, date = row.xpath("./td")
+
+            # handle the one known date typo in the source table ("Aguust")
+            date_str = date.text_content().replace("Aguust", "August")
+
+            case_name = titlecase(name.text_content())
+            if "Revised" in case_name:
+                # drop the "Revised ..." suffix from revised opinions
+                case_name = case_name.split("Revised")[0].strip()
+            self.cases.append(
+                {
+                    "citation": cite.text_content(),
+                    "date": date_str,
+                    "name": case_name,
+                    "url": name.xpath(".//a")[0].attrib["href"],
+                    "docket": "",
+                }
+            )
+
+    def extract_from_text(self, scraped_text: str) -> dict:
+        """Extract additional metadata from the first page of a Maine opinion.
+
+        :param scraped_text: The first page of content
+        :return: The dictionary of extracted data
+        """
+        pattern = re.compile(
+            r"(?P<label>Docket|On Briefs|Decided|Argued|Panel|Reporter of Decisions):\s*(?P<value>[^\n]+)"
+        )
+        extracted = {}
+        for match in pattern.finditer(scraped_text[:500]):
+            label = match.group("label")
+            value = match.group("value").strip()
+            extracted[label] = value
+
+        author = r"(?P<author_str>.*)\n+(\s+)?\[¶1\]"
+        m = re.search(author, scraped_text, re.MULTILINE)
+        if m:
+            if m.group("author_str") == "PER CURIAM":
+                per_curiam = True
+                author_str = ""
+            else:
+                per_curiam = False
+                author_str = m.group("author_str")
+        else:
+            per_curiam = False
+            author_str = ""
+
+        date_argued = extracted.get("On Briefs", "") or extracted.get(
+            "Argued", ""
+        )
+        date_argued_str = ""
+        if date_argued:
+            # Normalize to YYYY-MM-DD
+            date_argued = convert_date_string(date_argued)
+            date_argued_str = date_argued.strftime("%Y-%m-%d")
+
+        metadata = {
+            "Opinion": {
+                "author_str": normalize_judge_names(author_str),
+                "per_curiam": per_curiam,
+            },
+            "OpinionCluster": {
+                "judges": extracted.get("Panel", ""),
+            },
+            "Docket": {
+                "date_argued": date_argued_str,
+                "docket_number": extracted.get("Docket", ""),
+            },
+        }
+
+        return metadata
+
+    def _download_backwards(self, year: int) -> None:
+        self.url = self.url_template.format(year)
+        logger.info("Backscraping for year %s %s", year, self.url)
+        self.html = self._download()
+        self._process_html()
+
+    def make_backscrape_iterable(self, kwargs: dict):
+        if kwargs.get("backscrape_start"):
+            start = int(kwargs["backscrape_start"])
+        else:
+            start = self.first_opinion_year
+
+        if kwargs.get("backscrape_end"):
+            end = int(kwargs["backscrape_end"])
+        else:
+            end = date.today().year - 1
+
+        # range() is end-exclusive; bump end so a single requested year still yields one page
+        if start == end:
+            end = start + 1
+
+        self.back_scrape_iterable = range(start, end)
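
A minimal driver sketch (not part of this commit) for exercising the new backscraper directly, assuming the module is importable at its usual juriscraper path and that Site.__init__ accepts the backscrape_start/backscrape_end keyword arguments that make_backscrape_iterable reads:

    # Hypothetical driver; import path and kwargs are assumptions based on the diff above.
    from juriscraper.opinions.united_states.state import me

    site = me.Site(backscrape_start="2018", backscrape_end="2020")
    for year in site.back_scrape_iterable:  # range(2018, 2020) -> 2018, 2019
        site._download_backwards(year)      # fetches .../lawcourt/<year>/index.html

    for case in site.cases:                 # rows accumulated by _process_html()
        print(case["date"], case["citation"], case["name"], case["url"])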
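
The module docstring still points future work at the CourtListener clusters API; a minimal sketch of that query, assuming the requests package and a CourtListener API token (neither is part of this commit), with response field names that are assumptions about the v3 API:

    # Sketch only: query the endpoint named in the docstring for Maine clusters.
    import requests

    response = requests.get(
        "https://www.courtlistener.com/api/rest/v3/clusters/",
        params={"docket__court__id": "me"},
        headers={"Authorization": "Token YOUR_API_TOKEN"},  # hypothetical token
        timeout=30,
    )
    response.raise_for_status()
    for cluster in response.json().get("results", []):
        print(cluster.get("date_filed"), cluster.get("case_name"))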