Skip to content

Commit 23f3bcb

Browse files
committed
feat: Add Connecticut procedure rules scraper and enhance tests for extraction and scraping functionality
1 parent 40037a3 commit 23f3bcb

File tree

4 files changed

+711
-0
lines changed

4 files changed

+711
-0
lines changed

ipfs_datasets_py/processors/legal_scrapers/state_admin_rules_scraper.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10892,6 +10892,19 @@ async def _run_single_state(single_state: str) -> Dict[str, Any]:
1089210892
az_fetch_diagnostics: Dict[str, Any] = {}
1089310893
allowed_hosts = _allowed_discovery_hosts_for_state(state_code, state_name)
1089410894
state_rate_limit_metadata: Dict[str, Any] = {}
10895+
co_progress_enabled = state_code == "CO"
10896+
10897+
def _record_co_progress(phase: str, **details: Any) -> None:
    """Print one flush-immediate progress line for Colorado (CO) runs.

    Does nothing unless Colorado progress logging is enabled. Detail
    entries whose value is None are omitted from the output line.
    """
    if not co_progress_enabled:
        return
    fragments = [
        f"{key}={value}" for key, value in details.items() if value is not None
    ]
    detail_suffix = f" {' '.join(fragments)}" if fragments else ""
    # Elapsed time is clamped at zero to guard against clock oddities.
    elapsed_s = max(0.0, time.monotonic() - state_start)
    print(f"co_progress phase={phase} elapsed_s={elapsed_s:.1f}{detail_suffix}", flush=True)
1089510908

1089610909
def _init_az_phase(name: str) -> Dict[str, Any]:
1089710910
phase = az_fetch_diagnostics.get(name)
@@ -11517,6 +11530,11 @@ def _record_rate_limit_metadata(candidate: Any) -> None:
1151711530
allowed_hosts=allowed_hosts,
1151811531
limit=colorado_bootstrap_limit,
1151911532
)
11533+
_record_co_progress(
11534+
"bootstrap_done",
11535+
discovered=len(colorado_bootstrap_document_urls),
11536+
limit=colorado_bootstrap_limit,
11537+
)
1152011538
for document_url in colorado_bootstrap_document_urls:
1152111539
candidate_urls.append(document_url)
1152211540
if colorado_bootstrap_document_urls:
@@ -12172,6 +12190,15 @@ async def _append_document_if_rule(
1217212190
title=section_name,
1217312191
method_value=method_value,
1217412192
)
12193+
if co_progress_enabled:
12194+
accepted_count = len(statutes)
12195+
if accepted_count <= 5 or accepted_count % 25 == 0:
12196+
_record_co_progress(
12197+
"accepted_rule",
12198+
accepted=accepted_count,
12199+
source_phase=source_phase or "pending_candidate",
12200+
url=doc_url,
12201+
)
1217512202
return True
1217612203

1217712204
prefetch_candidates: List[str] = []
@@ -12807,6 +12834,13 @@ async def _append_document_if_rule(
1280712834
),
1280812835
exclude_urls=ranked_direct_exclude_urls,
1280912836
)
12837+
_record_co_progress(
12838+
"direct_detail_queue_ready",
12839+
ranked=len(ranked_urls),
12840+
queued=len(prioritized_ranked_document_urls),
12841+
seed_docs=len(prioritized_seed_document_urls),
12842+
direct_ready=int(bool(direct_detail_ready)),
12843+
)
1281012844

1281112845
if state_code == "AZ" and prioritized_ranked_document_urls:
1281212846
az_late_retry_urls = _prioritized_arizona_late_retry_urls(
@@ -13112,6 +13146,13 @@ async def _append_document_if_rule(
1311213146
continue
1311313147
seed_expansion_candidates.append((link_url, link_score + 3))
1311413148

13149+
_record_co_progress(
13150+
"direct_detail_complete",
13151+
accepted=len(statutes),
13152+
inspected=inspected_urls,
13153+
seed_expansions=len(seed_expansion_candidates),
13154+
)
13155+
1311513156
if state_code == "AZ" and len(statutes) < max_fetch:
1311613157
az_late_retry_urls = _prioritized_arizona_late_retry_urls(
1311713158
prioritized_ranked_document_urls,

ipfs_datasets_py/processors/legal_scrapers/state_procedure_rules_scraper.py

Lines changed: 257 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,24 @@
249249
"Where ",
250250
"Within ",
251251
)
252+
# Idaho Supreme Court rule landing pages to harvest, one entry per rule set.
# Each entry carries the classification fields copied onto every rule
# discovered on that page.
_ID_RULE_LIST_PAGES: List[Dict[str, str]] = [
    {
        "title_name": "Idaho Rules of Civil Procedure",
        "url": "https://isc.idaho.gov/ircp-new",
        "procedure_family": "civil_procedure",
        "legal_area": "civil_procedure",
        "official_cite_prefix": "I.R.C.P.",
    },
    {
        "title_name": "Idaho Criminal Rules",
        "url": "https://isc.idaho.gov/icr",
        "procedure_family": "criminal_procedure",
        "legal_area": "criminal_procedure",
        "official_cite_prefix": "I.C.R.",
    },
]
# Matches rule labels like "Rule 12. Defenses and Objections"; group 1 is
# the rule number (optionally dotted), group 2 the rule title.
_ID_RULE_LINK_RE = re.compile(r"^Rule\s+(\d+(?:\.\d+)?)\.\s+(.+)$", re.IGNORECASE)
# Captures dates from phrases such as "effective July 1, 2024".
_ID_EFFECTIVE_DATE_RE = re.compile(r"effective\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", re.IGNORECASE)
252270

253271
_CIVIL_PATTERNS = [
254272
re.compile(r"rules?\s+of\s+civil\s+procedure", re.IGNORECASE),
@@ -420,6 +438,17 @@ async def scrape_code(self, code_name: str, code_url: str) -> List[NormalizedSta
420438
return []
421439

422440

441+
class _IdahoProcedureRulesSupplementFetcher(BaseStateScraper):
    """Fetch-only scraper stub for Idaho Supreme Court rule pages.

    Inherits the HTTP/fetch machinery from BaseStateScraper; code listing
    and per-code scraping are intentionally empty because the Idaho
    supplement routine drives all fetching itself.
    """

    def get_base_url(self) -> str:
        """Return the Idaho Supreme Court site root."""
        return "https://isc.idaho.gov"

    def get_code_list(self) -> List[Dict[str, str]]:
        """Return no codes; discovery happens in the supplement routine."""
        return []

    async def scrape_code(self, code_name: str, code_url: str) -> List[NormalizedStatute]:
        """Scrape nothing; rule pages are fetched by the supplement routine."""
        return []
450+
451+
423452
def _extract_rhode_island_rule_links(html_text: str, page_url: str) -> List[Dict[str, str]]:
424453
try:
425454
from bs4 import BeautifulSoup
@@ -796,6 +825,127 @@ def flush() -> None:
796825
return statutes
797826

798827

828+
def _extract_idaho_rule_links(
829+
html_text: str,
830+
*,
831+
page_url: str,
832+
procedure_family: str,
833+
legal_area: str,
834+
official_cite_prefix: str,
835+
) -> List[Dict[str, str]]:
836+
try:
837+
from bs4 import BeautifulSoup
838+
except ImportError:
839+
return []
840+
841+
soup = BeautifulSoup(html_text or "", "html.parser")
842+
content = soup.select_one("div.field-name-body")
843+
if content is None:
844+
return []
845+
846+
discovered: List[Dict[str, str]] = []
847+
seen = set()
848+
for anchor in content.select("a[href]"):
849+
href = str(anchor.get("href") or "").strip()
850+
parent = anchor.find_parent("p")
851+
label_source = parent.get_text(" ", strip=True) if parent is not None else anchor.get_text(" ", strip=True)
852+
label = " ".join(str(label_source or "").split())
853+
if not href or "Rule " not in label:
854+
continue
855+
match = _ID_RULE_LINK_RE.match(label)
856+
if not match:
857+
continue
858+
absolute_url = urljoin(page_url, href)
859+
key = absolute_url.lower()
860+
if key in seen:
861+
continue
862+
seen.add(key)
863+
discovered.append(
864+
{
865+
"section_number": match.group(1).strip(),
866+
"section_name": match.group(2).strip().rstrip("."),
867+
"url": absolute_url,
868+
"procedure_family": procedure_family,
869+
"legal_area": legal_area,
870+
"official_cite_prefix": official_cite_prefix,
871+
}
872+
)
873+
874+
return discovered
875+
876+
877+
def _extract_idaho_rule_from_html(
878+
html_text: str,
879+
*,
880+
rule_url: str,
881+
title_name: str,
882+
procedure_family: str,
883+
legal_area: str,
884+
official_cite_prefix: str,
885+
) -> Optional[NormalizedStatute]:
886+
try:
887+
from bs4 import BeautifulSoup
888+
except ImportError:
889+
return None
890+
891+
soup = BeautifulSoup(html_text or "", "html.parser")
892+
content = soup.select_one("div.field-name-body div.field-item")
893+
if content is None:
894+
return None
895+
896+
text_lines = []
897+
for node in content.find_all(["p", "h2", "h3"]):
898+
text = " ".join(node.get_text(" ", strip=True).split())
899+
if not text or text == "\u00a0":
900+
continue
901+
if text.startswith("Members of the ") or text == "TERMS OF OFFICE":
902+
break
903+
text_lines.append(text)
904+
905+
if not text_lines:
906+
return None
907+
908+
heading = text_lines[0]
909+
heading_match = re.search(r"Rule\s+(\d+(?:\.\d+)?)\.\s+(.+)$", heading, re.IGNORECASE)
910+
if not heading_match:
911+
return None
912+
913+
section_number = heading_match.group(1).strip()
914+
section_name = heading_match.group(2).strip().rstrip(".")
915+
full_text = "\n".join(text_lines).strip()
916+
if len(full_text) < 40:
917+
return None
918+
919+
effective_dates = [
920+
" ".join(match.group(1).split())
921+
for match in _ID_EFFECTIVE_DATE_RE.finditer(full_text)
922+
if match.group(1)
923+
]
924+
effective_date = effective_dates[-1] if effective_dates else None
925+
926+
return NormalizedStatute(
927+
state_code="ID",
928+
state_name=US_STATES["ID"],
929+
statute_id=f"{official_cite_prefix} {section_number}",
930+
code_name=title_name,
931+
title_name=title_name,
932+
chapter_name=title_name,
933+
section_number=section_number,
934+
section_name=section_name,
935+
short_title=section_name,
936+
full_text=full_text,
937+
summary=section_name,
938+
source_url=f"{rule_url}#rule-{section_number.lower()}",
939+
official_cite=f"{official_cite_prefix} {section_number}",
940+
legal_area=legal_area,
941+
structured_data={
942+
"effective_date": effective_date,
943+
"source_kind": "idaho_supreme_court_rule_page",
944+
"procedure_family": procedure_family,
945+
},
946+
)
947+
948+
799949
def _connecticut_procedure_family_for_section(section_number: str) -> Optional[str]:
800950
numeric_prefix = int(str(section_number or "0").split("-", 1)[0] or 0)
801951
if 11 <= numeric_prefix <= 25:
@@ -2518,6 +2668,96 @@ async def _scrape_connecticut_court_rules_supplement(
25182668
return supplemental_rules, fetcher.get_fetch_analytics_snapshot()
25192669

25202670

2671+
async def _scrape_idaho_court_rules_supplement(
    *,
    existing_source_urls: Optional[set[str]] = None,
    max_rules: Optional[int] = None,
) -> tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """Fetch Idaho Supreme Court procedure rules not already captured.

    Walks each landing page in _ID_RULE_LIST_PAGES, discovers rule-detail
    links, fetches and parses each rule page, and returns the enriched
    rule dicts plus the fetcher's analytics snapshot.

    Args:
        existing_source_urls: URLs already scraped; matching rule pages
            are skipped (compared case-insensitively).
        max_rules: Optional cap on how many new rules to collect; None or
            non-positive means unlimited.

    Returns:
        Tuple of (list of enriched rule dicts, fetch-analytics dict).
    """
    fetcher = _IdahoProcedureRulesSupplementFetcher("ID", US_STATES["ID"])
    # Normalize known URLs once for case-insensitive dedupe below.
    existing_urls = {
        str(url or "").strip().lower()
        for url in (existing_source_urls or set())
        if str(url or "").strip()
    }
    # remaining is None when there is no budget cap.
    remaining = int(max_rules) if max_rules and int(max_rules) > 0 else None
    supplemental_rules: List[Dict[str, Any]] = []

    for list_page in _ID_RULE_LIST_PAGES:
        if remaining is not None and remaining <= 0:
            break

        list_url = str(list_page["url"])
        # The validator accepts the landing page only if at least one rule
        # link can be extracted from it.
        list_html = await _fetch_html_with_direct_fallback(
            fetcher,
            list_url,
            validator=lambda html: len(
                _extract_idaho_rule_links(
                    html,
                    page_url=list_url,
                    procedure_family=str(list_page["procedure_family"]),
                    legal_area=str(list_page["legal_area"]),
                    official_cite_prefix=str(list_page["official_cite_prefix"]),
                )
            )
            > 0,
            timeout_seconds=120,
        )
        if not list_html:
            continue

        rule_links = _extract_idaho_rule_links(
            list_html,
            page_url=list_url,
            procedure_family=str(list_page["procedure_family"]),
            legal_area=str(list_page["legal_area"]),
            official_cite_prefix=str(list_page["official_cite_prefix"]),
        )
        for rule in rule_links:
            if remaining is not None and remaining <= 0:
                break

            rule_url = str(rule["url"])
            if rule_url.lower() in existing_urls:
                continue

            # Accept the detail page only when it parses to a statute.
            rule_html = await _fetch_html_with_direct_fallback(
                fetcher,
                rule_url,
                validator=lambda html: _extract_idaho_rule_from_html(
                    html,
                    rule_url=rule_url,
                    title_name=str(list_page["title_name"]),
                    procedure_family=str(rule["procedure_family"]),
                    legal_area=str(rule["legal_area"]),
                    official_cite_prefix=str(rule["official_cite_prefix"]),
                )
                is not None,
                timeout_seconds=120,
            )
            if not rule_html:
                continue

            # Re-parse outside the validator to get the statute object.
            statute = _extract_idaho_rule_from_html(
                rule_html,
                rule_url=rule_url,
                title_name=str(list_page["title_name"]),
                procedure_family=str(rule["procedure_family"]),
                legal_area=str(rule["legal_area"]),
                official_cite_prefix=str(rule["official_cite_prefix"]),
            )
            if statute is None:
                continue

            enriched = fetcher._enrich_statute_structure(statute).to_dict()
            # Prefer the classifier's verdict; fall back to the page's family.
            family = _classify_procedure_family(enriched) or str(rule["procedure_family"])
            enriched["procedure_family"] = family
            supplemental_rules.append(enriched)
            existing_urls.add(rule_url.lower())
            remaining = None if remaining is None else remaining - 1

    return supplemental_rules, fetcher.get_fetch_analytics_snapshot()
2759+
2760+
25212761
def _resolve_output_dir(output_dir: Optional[str] = None) -> Path:
25222762
if output_dir:
25232763
return Path(output_dir).expanduser().resolve()
@@ -2819,6 +3059,23 @@ async def scrape_state_procedure_rules(
28193059
if ct_fetch_analytics:
28203060
supplemental_fetch_analytics_by_state[state_code] = ct_fetch_analytics
28213061

3062+
if state_code == "ID":
3063+
remaining_rule_budget = None
3064+
if max_rules and max_rules > 0:
3065+
remaining_rule_budget = max(int(max_rules) - len(procedure_statutes), 0)
3066+
id_supplement, id_fetch_analytics = await _scrape_idaho_court_rules_supplement(
3067+
existing_source_urls=seen_source_urls,
3068+
max_rules=remaining_rule_budget,
3069+
)
3070+
if id_supplement:
3071+
procedure_statutes.extend(id_supplement)
3072+
for rule in id_supplement:
3073+
family = str(rule.get("procedure_family") or "").strip()
3074+
if family:
3075+
family_counts[family] = int(family_counts.get(family, 0)) + 1
3076+
if id_fetch_analytics:
3077+
supplemental_fetch_analytics_by_state[state_code] = id_fetch_analytics
3078+
28223079
if max_rules and max_rules > 0:
28233080
procedure_statutes = procedure_statutes[: int(max_rules)]
28243081

0 commit comments

Comments (0)