Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ Releases are also tagged in git, if that's helpful.

## Coming up

- Fix `me`: update Maine scraper and add backscraper
- Update `sd` backscraper and extract from text

## Current

Expand Down
96 changes: 74 additions & 22 deletions juriscraper/opinions/united_states/state/sd.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,39 @@


class Site(OpinionSiteLinear):
start_year = 1996
# data available in HTML format since 1996, in PDF since 2006
start_year = 2006
# Lookup table from a justice's initials to the justice's full name.
# `extract_from_text` reads the initials from the header at the top of the
# opinion text (e.g. the "MES" in "#30018-a-MES") and resolves them here;
# initials missing from this table are reported through `logger.error`.
# judges full names from https://ujs.sd.gov/Supreme_Court/Justices.aspx
initials_to_judges = {
"MES": "Mark E. Salter",
"SPM": "Scott P. Myren",
"SRJ": "Steven R. Jensen",
"PJD": "Patricia J. DeVaney",
"JMK": "Janine M. Kern",
# not current
# (each URL below is presumably a sample opinion carrying those initials)
"JKM": "Judith Meierhenry", # https://ujs.sd.gov/uploads/sc/opinions/24063.pdf
"DG": "David Gilbertson", # https://ujs.sd.gov/uploads/sc/opinions/23939.pdf
"SLZ": "Steven L. Zinter", # https://ujs.sd.gov/uploads/sc/opinions/24439.pdf
"RWS": "Richard Sabers", # https://ujs.sd.gov/uploads/sc/opinions/24501.pdf
"JKK": "John Konenkamp", # https://ujs.sd.gov/uploads/sc/opinions/24387.pdf
"GAS": "Glen Severson", # https://ujs.sd.gov/uploads/sc/opinions/25115.pdf
"LSW": "Lori Wilbur", # https://ujs.sd.gov/uploads/sc/opinions/25808.pdf
}
# Lookup table from the abbreviated disposition found in the opinion header
# (the lowercase text between the docket number and the judge initials, e.g.
# the "a" in "#30018-a-MES") to the disposition text stored on the cluster.
# Keys must match the header abbreviation exactly.
disposition_mapper = {
    "dismiss": "Dismissed",
    "dis": "Dismissed",  # https://ujs.sd.gov/uploads/sc/opinions/24312.pdf
    "a": "Affirmed",
    "r": "Reversed and remanded",
    # Spelled the same way as the "r" entry so both abbreviations yield one
    # consistent disposition value.
    "rev & rem": "Reversed and remanded",  # https://ujs.sd.gov/uploads/sc/opinions/24409.pdf
    "aff in pt & rev in pt": "Affirmed in part and reversed in part",
    "aff in pt, rev in pt & rem": "Affirmed in part, reversed in part and remanded",  # https://ujs.sd.gov/uploads/sc/opinions/23919.pdf
    "aff in pt, vacate, & rem in pt": "Affirmed in part, vacated and remanded in part",
    "aff in pt & vacate": "Affirmed and vacated",  # https://www.courtlistener.com/opinion/9502826/state-v-scott/pdf/
}

def __init__(self, *args, **kwargs):
    """Set up the scraper instance.

    Forwards every positional and keyword argument to the parent
    initializer, stores the scraper's static settings on the instance, and
    finally passes the keyword arguments to `make_backscrape_iterable`.
    """
    super().__init__(*args, **kwargs)

    # Static settings for this scraper.
    self.url = "https://ujs.sd.gov/Supreme_Court/opinions.aspx"
    self.status = "Published"
    self.court_id = self.__module__

    # Backscraping is off by default.
    self.is_backscrape = False
    self.make_backscrape_iterable(kwargs)
Expand All @@ -33,27 +60,28 @@ def _process_html(self) -> None:
:return None
"""
rows = self.html.xpath(
"//div[@id='ContentPlaceHolder1_ChildContent1_UpdatePanel_Opinions']//tbody/tr"
"//div[@id='ContentPlaceHolder1_ChildContent1_UpdatePanel_Opinions']//tbody/tr[not(th)][not(.//a[contains(@id, 'DataList_Paging_LinkButton')])]"
)
for row in rows:
title = get_row_column_text(row, 2)
cite = re.findall(r"\d{4} S\.D\. \d+", title)

status = "Published"
cite = re.findall(r"\d{4} S\.?D\.? \d+", title)
if not cite:
continue
status = "Unpublished"
cite = [""]

# https://ujs.sd.gov/uploads/sc/opinions/2928369ef9a6.pdf
# We abstract out the first part of the docket number here
# And process the full docket number in the `extract_from_text` method
# Called after the file has been downloaded.
url = row.xpath(".//td[2]/a/@href")[0]
docket = url.split("/")[-1][:5]
name = titlecase(title.rsplit(",", 1)[0] if cite else title)
self.cases.append(
{
"date": get_row_column_text(row, 1),
"name": titlecase(title.rsplit(",", 1)[0]),
"name": name,
"citation": cite[0],
"url": url,
"docket": docket,
"status": status,
}
)

Expand Down Expand Up @@ -137,20 +165,44 @@ def make_backscrape_iterable(self, kwargs: Dict) -> None:
def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
    """Extract docket numbers, judges and disposition from the opinion text.

    The opinion text opens with a header such as
    `#29901-aff in pt & rev in pt-PJD & SRJ`: one or more `#`-prefixed
    docket numbers, an optional lowercase disposition abbreviation and the
    initials of the authoring judge(s), separated by hyphens.

    Some edge cases:
    - case with 2 judges https://www.courtlistener.com/opinion/9456271/mcgee-v-spencer-quarries-inc/pdf/
    - case without disposition: https://www.courtlistener.com/opinion/10121701/discipline-of-ravnsborg/pdf/
    - case without a judge string https://www.courtlistener.com/opinion/9474051/in-the-matter-of-the-interpretation-of-south-dakota-constitution-and-state/pdf/

    :param scraped_text: The content of the document downloaded
    :return: Metadata to be added to the case. May contain a `Docket` entry
        (`docket_number`) and an `OpinionCluster` entry (`judges` and/or
        `disposition`); empty when nothing could be parsed.
    """
    # The header appears to be the first text on the page, so only the
    # first 100 characters are inspected to avoid matching anything in the
    # body of the opinion.
    target_text = scraped_text[:100]
    metadata: Dict[str, Any] = {}

    # Every run of digits that directly follows a "#" is a docket number.
    dockets = re.findall(r"(?<=#)\d+", target_text)
    if dockets:
        metadata["Docket"] = {"docket_number": ", ".join(dockets)}

    # Judges and disposition both live under "OpinionCluster"; collect them
    # in one dict and attach it only when something was found.
    cluster: Dict[str, str] = {}

    # A hyphen followed by 2-3 capital letters, optionally continued by
    # more initials joined with "," or "&" (e.g. "-PJD & SRJ").
    judge_regex = r"-[A-Z]{2,3}(\s*[,&]\s+[A-Z]{2,3})*"
    if judges_match := re.search(judge_regex, target_text):
        # Replace every separator with a space, then split on whitespace
        # to obtain the bare initials.
        initials = re.sub(r"[\s,&-]+", " ", judges_match.group(0)).split()
        judges = []
        for initial in initials:
            if judge := self.initials_to_judges.get(initial):
                judges.append(judge)
            else:
                # Catch updates
                logger.error(
                    "Judge initials not mapped to full name %s", initial
                )

        if judges:
            cluster["judges"] = ", ".join(judges)

    # The disposition abbreviation is the lowercase text enclosed between
    # two hyphens.
    disposition_regex = r"(?<=-)[a-z,&\s]+(?=-)"
    if disposition_match := re.search(disposition_regex, target_text):
        # Collapse whitespace so a line break or doubled space in the
        # extracted text does not defeat the exact-match lookup.
        raw_disposition = " ".join(disposition_match.group(0).split())
        if disp := self.disposition_mapper.get(raw_disposition):
            cluster["disposition"] = disp
        elif raw_disposition:
            # Catch updates, mirroring the handling of unknown initials
            logger.error(
                "Disposition not mapped to full text %s", raw_disposition
            )

    if cluster:
        metadata["OpinionCluster"] = cluster

    return metadata
25 changes: 23 additions & 2 deletions tests/local/test_ScraperExtractFromTextTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,29 @@ class ScraperExtractFromText(unittest.TestCase):
],
"juriscraper.opinions.united_states.state.sd": [
(
"""#30018-a-MES\n2023 S.D. 4""",
{"Docket": {"docket_number": "#30018-a-MES"}},
# https://www.courtlistener.com/opinion/9456271/mcgee-v-spencer-quarries-inc/pdf/
"""#29901-aff in pt & rev in pt-PJD & SRJ\n2023 S.D. 66\nIN THE SUPREME COURT""",
{
"Docket": {"docket_number": "29901"},
"OpinionCluster": {
"disposition": "Affirmed in part and reversed in part",
"judges": "Patricia J. DeVaney, Steven R. Jensen",
},
},
"""#30354-SRJ\n2024 S.D. 58\nIN THE SUPREME COURT\nOF THE""",
{
"Docket": {"docket_number": "30354"},
"OpinionCluster": {"judges": "Steven R. Jensen"},
},
# https://www.courtlistener.com/opinion/9406747/estate-of-beadle/?q=court_id%3Asd&page=8
"""#30086, #30094-r-SPM\n2023 S.D. 26\nIN THE SUPREME COURT\nOF THE\nSTATE OF SOUTH DAKOTA""",
{
"Docket": {"docket_number": "30086, 30094"},
"OpinionCluster": {
"judges": "Scott P. Myren",
"disposition": "Reversed and remanded",
},
},
),
],
"juriscraper.opinions.united_states.territories.nmariana": [
Expand Down
Loading