diff --git a/CHANGES.md b/CHANGES.md index cb95ce675..e39584f78 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -12,6 +12,8 @@ Releases are also tagged in git, if that's helpful. ## Coming up +- Fix `me` Update maine scraper and add backscraper +- Update `sd` backscraper and extract from text ## Current diff --git a/juriscraper/opinions/united_states/state/sd.py b/juriscraper/opinions/united_states/state/sd.py index cc81adc10..cf52dab7a 100644 --- a/juriscraper/opinions/united_states/state/sd.py +++ b/juriscraper/opinions/united_states/state/sd.py @@ -17,12 +17,39 @@ class Site(OpinionSiteLinear): - start_year = 1996 + # data available in HTML format since 1996, in PDF since 2006 + start_year = 2006 + # judges full names from https://ujs.sd.gov/Supreme_Court/Justices.aspx + initials_to_judges = { + "MES": "Mark E. Salter", + "SPM": "Scott P. Myren", + "SRJ": "Steven R. Jensen", + "PJD": "Patricia J. DeVaney", + "JMK": "Janine M. Kern", + # not current + "JKM": "Judith Meierhenry", # https://ujs.sd.gov/uploads/sc/opinions/24063.pdf + "DG": "David Gilbertson", # https://ujs.sd.gov/uploads/sc/opinions/23939.pdf + "SLZ": "Steven L. Zinter", # https://ujs.sd.gov/uploads/sc/opinions/24439.pdf + "RWS": "Richard Sabers", # https://ujs.sd.gov/uploads/sc/opinions/24501.pdf + "JKK": "John Konenkamp", # https://ujs.sd.gov/uploads/sc/opinions/24387.pdf + "GAS": "Glen Severson", # https://ujs.sd.gov/uploads/sc/opinions/25115.pdf + "LSW": "Lori Wilbur", # https://ujs.sd.gov/uploads/sc/opinions/25808.pdf + } + disposition_mapper = { + "dismiss": "Dismissed", + "dis": "Dismissed", # https://ujs.sd.gov/uploads/sc/opinions/24312.pdf + "a": "Affirmed", + "r": "Reversed and remanded", + "rev & rem": "Reverse and remanded", # https://ujs.sd.gov/uploads/sc/opinions/24409.pdf + "aff in pt & rev in pt": "Affirmed in part and reversed in part", + "aff in pt, rev in pt & rem": "Affirmed in part, reversed in part and remanded", # https://ujs.sd.gov/uploads/sc/opinions/23919.pdf + "aff in pt, vacate, & rem in pt": "Affirmed in part, vacated and remanded in part", + "aff in pt & vacate": "Affirmed and vacated", # https://www.courtlistener.com/opinion/9502826/state-v-scott/pdf/ + } def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.court_id = self.__module__ - self.status = "Published" self.url = "https://ujs.sd.gov/Supreme_Court/opinions.aspx" self.is_backscrape = False self.make_backscrape_iterable(kwargs) @@ -33,27 +60,28 @@ def _process_html(self) -> None: :return None """ rows = self.html.xpath( - "//div[@id='ContentPlaceHolder1_ChildContent1_UpdatePanel_Opinions']//tbody/tr" + "//div[@id='ContentPlaceHolder1_ChildContent1_UpdatePanel_Opinions']//tbody/tr[not(th)][not(.//a[contains(@id, 'DataList_Paging_LinkButton')])]" ) for row in rows: title = get_row_column_text(row, 2) - cite = re.findall(r"\d{4} S\.D\. \d+", title) + + status = "Published" + cite = re.findall(r"\d{4} S\.?D\.? \d+", title) if not cite: - continue + status = "Unpublished" + cite = [""] - # https://ujs.sd.gov/uploads/sc/opinions/2928369ef9a6.pdf - # We abstract out the first part of the docket number here - # And process the full docket number in the `extract_from_text` method - # Called after the file has been downloaded. url = row.xpath(".//td[2]/a/@href")[0] docket = url.split("/")[-1][:5] + name = titlecase(title.rsplit(",", 1)[0] if cite else title) self.cases.append( { "date": get_row_column_text(row, 1), - "name": titlecase(title.rsplit(",", 1)[0]), + "name": name, "citation": cite[0], "url": url, "docket": docket, + "status": status, } ) @@ -137,20 +165,44 @@ def make_backscrape_iterable(self, kwargs: Dict) -> None: def extract_from_text(self, scraped_text: str) -> Dict[str, Any]: """Can we extract the date filed from the text? + Some edge cases: + - case with 2 judges https://www.courtlistener.com/opinion/9456271/mcgee-v-spencer-quarries-inc/pdf/ + - case without disposition: https://www.courtlistener.com/opinion/10121701/discipline-of-ravnsborg/pdf/ + - case without a judge string https://www.courtlistener.com/opinion/9474051/in-the-matter-of-the-interpretation-of-south-dakota-constitution-and-state/pdf/ + :param scraped_text: The content of the document downloaded :return: Metadata to be added to the case """ + metadata = {} + target_text = scraped_text[:100] + + dockets = re.findall(r"(?<=#)\d+", target_text) + if dockets: + metadata["Docket"] = {"docket_number": ", ".join(dockets)} + + judge_regex = r"-[A-Z]{2,3}(\s*[,&]\s+[A-Z]{2,3})*" + if judges_match := re.search(judge_regex, target_text): + initials = re.sub(r"[\s,&-]+", " ", judges_match.group(0)).strip() + judges = [] + for initial in initials.split(" "): + if judge := self.initials_to_judges.get(initial): + judges.append(judge) + else: + # Catch updates + logger.error( + "Judge initials not mapped to full name %s", initial + ) + + if judges: + metadata["OpinionCluster"] = {"judges": ", ".join(judges)} + + disposition_regex = r"(?<=-)[a-z,&\s]+(?=-)" + if disposition_match := re.search(disposition_regex, target_text): + raw_disposition = disposition_match.group(0) + if disp := self.disposition_mapper.get(raw_disposition): + if metadata.get("OpinionCluster"): + metadata["OpinionCluster"]["disposition"] = disp + else: + metadata["OpinionCluster"] = {"disposition": disp} - # The docket number appears to be the first text on the page. - # So we crop the text to avoid any confusion that might occur in the - # body of an opinion. - docket = re.findall(r"#\d+.*-.-\w{3}", scraped_text[:100]) - if not docket: - return {} - - metadata = { - "Docket": { - "docket_number": docket[0], - }, - } return metadata diff --git a/tests/local/test_ScraperExtractFromTextTest.py b/tests/local/test_ScraperExtractFromTextTest.py index a9a599919..e23a55f99 100644 --- a/tests/local/test_ScraperExtractFromTextTest.py +++ b/tests/local/test_ScraperExtractFromTextTest.py @@ -89,8 +89,29 @@ class ScraperExtractFromText(unittest.TestCase): ], "juriscraper.opinions.united_states.state.sd": [ ( - """#30018-a-MES\n2023 S.D. 4""", - {"Docket": {"docket_number": "#30018-a-MES"}}, + # https://www.courtlistener.com/opinion/9456271/mcgee-v-spencer-quarries-inc/pdf/ + """#29901-aff in pt & rev in pt-PJD & SRJ\n2023 S.D. 66\nIN THE SUPREME COURT""", + { + "Docket": {"docket_number": "29901"}, + "OpinionCluster": { + "disposition": "Affirmed in part and reversed in part", + "judges": "Patricia J. DeVaney, Steven R. Jensen", + }, + }, + """#30354-SRJ\n2024 S.D. 58\nIN THE SUPREME COURT\nOF THE""", + { + "Docket": {"docket_number": "30354"}, + "OpinionCluster": {"judges": "Steven R. Jensen"}, + }, + # https://www.courtlistener.com/opinion/9406747/estate-of-beadle/?q=court_id%3Asd&page=8 + """#30086, #30094-r-SPM\n2023 S.D. 26\nIN THE SUPREME COURT\nOF THE\nSTATE OF SOUTH DAKOTA""", + { + "Docket": {"docket_number": "30086, 30094"}, + "OpinionCluster": { + "judges": "Scott P. Myren", + "disposition": "Reversed and remanded", + }, + }, ), ], "juriscraper.opinions.united_states.territories.nmariana": [