|
249 | 249 | "Where ", |
250 | 250 | "Within ", |
251 | 251 | ) |
# Index pages on the Idaho Supreme Court site that list procedure rules.
# Each entry carries the metadata propagated onto every rule scraped from
# that page (display title, procedure family / legal area tags, and the
# official citation prefix used to build statute IDs).
_ID_RULE_LIST_PAGES: List[Dict[str, str]] = [
    {
        "title_name": "Idaho Rules of Civil Procedure",
        "url": "https://isc.idaho.gov/ircp-new",
        "procedure_family": "civil_procedure",
        "legal_area": "civil_procedure",
        "official_cite_prefix": "I.R.C.P.",
    },
    {
        "title_name": "Idaho Criminal Rules",
        "url": "https://isc.idaho.gov/icr",
        "procedure_family": "criminal_procedure",
        "legal_area": "criminal_procedure",
        "official_cite_prefix": "I.C.R.",
    },
]
# Anchored "Rule <number>. <title>" label, e.g. "Rule 12. Defenses" or
# "Rule 4.1. Service"; group 1 = rule number, group 2 = title.
_ID_RULE_LINK_RE = re.compile(r"^Rule\s+(\d+(?:\.\d+)?)\.\s+(.+)$", re.IGNORECASE)
# Matches "effective <Month D, YYYY>" mentions inside rule text; group 1
# captures the date phrase.
_ID_EFFECTIVE_DATE_RE = re.compile(r"effective\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", re.IGNORECASE)
252 | 270 |
|
253 | 271 | _CIVIL_PATTERNS = [ |
254 | 272 | re.compile(r"rules?\s+of\s+civil\s+procedure", re.IGNORECASE), |
@@ -420,6 +438,17 @@ async def scrape_code(self, code_name: str, code_url: str) -> List[NormalizedSta |
420 | 438 | return [] |
421 | 439 |
|
422 | 440 |
|
class _IdahoProcedureRulesSupplementFetcher(BaseStateScraper):
    """Minimal scraper subclass used only for its fetch/analytics machinery.

    The Idaho supplement flow drives page fetching itself (see the
    module-level helpers), so the scraper-interface methods are inert
    stubs that satisfy the BaseStateScraper contract.
    """

    def get_base_url(self) -> str:
        # Idaho Supreme Court site hosting the rule pages.
        return "https://isc.idaho.gov"

    def get_code_list(self) -> List[Dict[str, str]]:
        # Unused: list pages come from _ID_RULE_LIST_PAGES instead.
        return []

    async def scrape_code(self, code_name: str, code_url: str) -> List[NormalizedStatute]:
        # Unused: rule pages are parsed by the module-level extractors.
        return []
423 | 452 | def _extract_rhode_island_rule_links(html_text: str, page_url: str) -> List[Dict[str, str]]: |
424 | 453 | try: |
425 | 454 | from bs4 import BeautifulSoup |
@@ -796,6 +825,127 @@ def flush() -> None: |
796 | 825 | return statutes |
797 | 826 |
|
798 | 827 |
|
| 828 | +def _extract_idaho_rule_links( |
| 829 | + html_text: str, |
| 830 | + *, |
| 831 | + page_url: str, |
| 832 | + procedure_family: str, |
| 833 | + legal_area: str, |
| 834 | + official_cite_prefix: str, |
| 835 | +) -> List[Dict[str, str]]: |
| 836 | + try: |
| 837 | + from bs4 import BeautifulSoup |
| 838 | + except ImportError: |
| 839 | + return [] |
| 840 | + |
| 841 | + soup = BeautifulSoup(html_text or "", "html.parser") |
| 842 | + content = soup.select_one("div.field-name-body") |
| 843 | + if content is None: |
| 844 | + return [] |
| 845 | + |
| 846 | + discovered: List[Dict[str, str]] = [] |
| 847 | + seen = set() |
| 848 | + for anchor in content.select("a[href]"): |
| 849 | + href = str(anchor.get("href") or "").strip() |
| 850 | + parent = anchor.find_parent("p") |
| 851 | + label_source = parent.get_text(" ", strip=True) if parent is not None else anchor.get_text(" ", strip=True) |
| 852 | + label = " ".join(str(label_source or "").split()) |
| 853 | + if not href or "Rule " not in label: |
| 854 | + continue |
| 855 | + match = _ID_RULE_LINK_RE.match(label) |
| 856 | + if not match: |
| 857 | + continue |
| 858 | + absolute_url = urljoin(page_url, href) |
| 859 | + key = absolute_url.lower() |
| 860 | + if key in seen: |
| 861 | + continue |
| 862 | + seen.add(key) |
| 863 | + discovered.append( |
| 864 | + { |
| 865 | + "section_number": match.group(1).strip(), |
| 866 | + "section_name": match.group(2).strip().rstrip("."), |
| 867 | + "url": absolute_url, |
| 868 | + "procedure_family": procedure_family, |
| 869 | + "legal_area": legal_area, |
| 870 | + "official_cite_prefix": official_cite_prefix, |
| 871 | + } |
| 872 | + ) |
| 873 | + |
| 874 | + return discovered |
| 875 | + |
| 876 | + |
def _extract_idaho_rule_from_html(
    html_text: str,
    *,
    rule_url: str,
    title_name: str,
    procedure_family: str,
    legal_area: str,
    official_cite_prefix: str,
) -> Optional[NormalizedStatute]:
    """Parse a single Idaho rule page into a NormalizedStatute.

    Returns None when bs4 is unavailable, the expected body container is
    missing, no "Rule N. Title" heading is present, or the extracted rule
    text is too short to be a real rule body.
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        return None

    document = BeautifulSoup(html_text or "", "html.parser")
    body = document.select_one("div.field-name-body div.field-item")
    if body is None:
        return None

    paragraphs = []
    for element in body.find_all(["p", "h2", "h3"]):
        collapsed = " ".join(element.get_text(" ", strip=True).split())
        if not collapsed or collapsed == "\u00a0":
            continue
        # Court-membership boilerplate marks the end of the rule text.
        if collapsed.startswith("Members of the ") or collapsed == "TERMS OF OFFICE":
            break
        paragraphs.append(collapsed)

    if not paragraphs:
        return None

    heading_match = re.search(r"Rule\s+(\d+(?:\.\d+)?)\.\s+(.+)$", paragraphs[0], re.IGNORECASE)
    if not heading_match:
        return None

    number = heading_match.group(1).strip()
    name = heading_match.group(2).strip().rstrip(".")
    rule_text = "\n".join(paragraphs).strip()
    if len(rule_text) < 40:
        # Too short to be a genuine rule body; treat as a parse miss.
        return None

    dates = [
        " ".join(m.group(1).split())
        for m in _ID_EFFECTIVE_DATE_RE.finditer(rule_text)
        if m.group(1)
    ]
    # The last "effective <date>" mention reflects the newest amendment.
    latest_effective = dates[-1] if dates else None

    return NormalizedStatute(
        state_code="ID",
        state_name=US_STATES["ID"],
        statute_id=f"{official_cite_prefix} {number}",
        code_name=title_name,
        title_name=title_name,
        chapter_name=title_name,
        section_number=number,
        section_name=name,
        short_title=name,
        full_text=rule_text,
        summary=name,
        source_url=f"{rule_url}#rule-{number.lower()}",
        official_cite=f"{official_cite_prefix} {number}",
        legal_area=legal_area,
        structured_data={
            "effective_date": latest_effective,
            "source_kind": "idaho_supreme_court_rule_page",
            "procedure_family": procedure_family,
        },
    )
| 948 | + |
799 | 949 | def _connecticut_procedure_family_for_section(section_number: str) -> Optional[str]: |
800 | 950 | numeric_prefix = int(str(section_number or "0").split("-", 1)[0] or 0) |
801 | 951 | if 11 <= numeric_prefix <= 25: |
@@ -2518,6 +2668,96 @@ async def _scrape_connecticut_court_rules_supplement( |
2518 | 2668 | return supplemental_rules, fetcher.get_fetch_analytics_snapshot() |
2519 | 2669 |
|
2520 | 2670 |
|
async def _scrape_idaho_court_rules_supplement(
    *,
    existing_source_urls: Optional[set[str]] = None,
    max_rules: Optional[int] = None,
) -> tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """Fetch Idaho civil/criminal procedure rules from the court's site.

    Walks each index page in _ID_RULE_LIST_PAGES, discovers individual
    rule links, fetches and parses each rule page, and returns the
    enriched rule dicts plus the fetcher's analytics snapshot.

    Args:
        existing_source_urls: URLs already scraped; matching rules are
            skipped (comparison is case-insensitive on the full URL).
        max_rules: Optional cap on how many new rules to collect.

    Returns:
        (supplemental_rules, fetch_analytics) tuple.
    """
    fetcher = _IdahoProcedureRulesSupplementFetcher("ID", US_STATES["ID"])
    # Normalize incoming URLs for case-insensitive dedupe.
    existing_urls = {
        str(url or "").strip().lower()
        for url in (existing_source_urls or set())
        if str(url or "").strip()
    }
    # None means "no budget limit"; otherwise a positive countdown.
    remaining = int(max_rules) if max_rules and int(max_rules) > 0 else None
    supplemental_rules: List[Dict[str, Any]] = []

    for list_page in _ID_RULE_LIST_PAGES:
        if remaining is not None and remaining <= 0:
            break

        list_url = str(list_page["url"])
        # Validator: the index page is only "good" if at least one rule
        # link can be extracted from it. The lambda closes over this
        # iteration's list_url/list_page, which is safe because the
        # fetch is awaited before the loop advances.
        list_html = await _fetch_html_with_direct_fallback(
            fetcher,
            list_url,
            validator=lambda html: len(
                _extract_idaho_rule_links(
                    html,
                    page_url=list_url,
                    procedure_family=str(list_page["procedure_family"]),
                    legal_area=str(list_page["legal_area"]),
                    official_cite_prefix=str(list_page["official_cite_prefix"]),
                )
            )
            > 0,
            timeout_seconds=120,
        )
        if not list_html:
            continue

        rule_links = _extract_idaho_rule_links(
            list_html,
            page_url=list_url,
            procedure_family=str(list_page["procedure_family"]),
            legal_area=str(list_page["legal_area"]),
            official_cite_prefix=str(list_page["official_cite_prefix"]),
        )
        for rule in rule_links:
            if remaining is not None and remaining <= 0:
                break

            rule_url = str(rule["url"])
            if rule_url.lower() in existing_urls:
                continue

            # Validator: the rule page must parse into a statute. Parsed
            # again below; the extra parse keeps the fetch helper's
            # retry/fallback logic tied to actual parseability.
            rule_html = await _fetch_html_with_direct_fallback(
                fetcher,
                rule_url,
                validator=lambda html: _extract_idaho_rule_from_html(
                    html,
                    rule_url=rule_url,
                    title_name=str(list_page["title_name"]),
                    procedure_family=str(rule["procedure_family"]),
                    legal_area=str(rule["legal_area"]),
                    official_cite_prefix=str(rule["official_cite_prefix"]),
                )
                is not None,
                timeout_seconds=120,
            )
            if not rule_html:
                continue

            statute = _extract_idaho_rule_from_html(
                rule_html,
                rule_url=rule_url,
                title_name=str(list_page["title_name"]),
                procedure_family=str(rule["procedure_family"]),
                legal_area=str(rule["legal_area"]),
                official_cite_prefix=str(rule["official_cite_prefix"]),
            )
            if statute is None:
                continue

            # Enrich, then let the classifier refine the family, falling
            # back to the list page's default when it can't classify.
            enriched = fetcher._enrich_statute_structure(statute).to_dict()
            family = _classify_procedure_family(enriched) or str(rule["procedure_family"])
            enriched["procedure_family"] = family
            supplemental_rules.append(enriched)
            existing_urls.add(rule_url.lower())
            remaining = None if remaining is None else remaining - 1

    return supplemental_rules, fetcher.get_fetch_analytics_snapshot()
| 2760 | + |
2521 | 2761 | def _resolve_output_dir(output_dir: Optional[str] = None) -> Path: |
2522 | 2762 | if output_dir: |
2523 | 2763 | return Path(output_dir).expanduser().resolve() |
@@ -2819,6 +3059,23 @@ async def scrape_state_procedure_rules( |
2819 | 3059 | if ct_fetch_analytics: |
2820 | 3060 | supplemental_fetch_analytics_by_state[state_code] = ct_fetch_analytics |
2821 | 3061 |
|
| 3062 | + if state_code == "ID": |
| 3063 | + remaining_rule_budget = None |
| 3064 | + if max_rules and max_rules > 0: |
| 3065 | + remaining_rule_budget = max(int(max_rules) - len(procedure_statutes), 0) |
| 3066 | + id_supplement, id_fetch_analytics = await _scrape_idaho_court_rules_supplement( |
| 3067 | + existing_source_urls=seen_source_urls, |
| 3068 | + max_rules=remaining_rule_budget, |
| 3069 | + ) |
| 3070 | + if id_supplement: |
| 3071 | + procedure_statutes.extend(id_supplement) |
| 3072 | + for rule in id_supplement: |
| 3073 | + family = str(rule.get("procedure_family") or "").strip() |
| 3074 | + if family: |
| 3075 | + family_counts[family] = int(family_counts.get(family, 0)) + 1 |
| 3076 | + if id_fetch_analytics: |
| 3077 | + supplemental_fetch_analytics_by_state[state_code] = id_fetch_analytics |
| 3078 | + |
2822 | 3079 | if max_rules and max_rules > 0: |
2823 | 3080 | procedure_statutes = procedure_statutes[: int(max_rules)] |
2824 | 3081 |
|
|
0 commit comments