Skip to content

Commit 23f3bcb

Browse files
committed
feat: Add Connecticut procedure rules scraper and enhance tests for extraction and scraping functionality
1 parent 40037a3 commit 23f3bcb

File tree

4 files changed

+711
-0
lines changed

4 files changed

+711
-0
lines changed

ipfs_datasets_py/processors/legal_scrapers/state_admin_rules_scraper.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10892,6 +10892,19 @@ async def _run_single_state(single_state: str) -> Dict[str, Any]:
1089210892
az_fetch_diagnostics: Dict[str, Any] = {}
1089310893
allowed_hosts = _allowed_discovery_hosts_for_state(state_code, state_name)
1089410894
state_rate_limit_metadata: Dict[str, Any] = {}
10895+
co_progress_enabled = state_code == "CO"
10896+
10897+
def _record_co_progress(phase: str, **details: Any) -> None:
    """Print one flush-immediate progress line for Colorado (CO) runs.

    Does nothing unless Colorado progress logging is enabled. Detail
    entries whose value is None are omitted from the output line.
    """
    if not co_progress_enabled:
        return
    fragments = [
        f"{key}={value}" for key, value in details.items() if value is not None
    ]
    detail_suffix = f" {' '.join(fragments)}" if fragments else ""
    # Elapsed time is clamped at zero to guard against clock oddities.
    elapsed_s = max(0.0, time.monotonic() - state_start)
    print(f"co_progress phase={phase} elapsed_s={elapsed_s:.1f}{detail_suffix}", flush=True)
1089510908

1089610909
def _init_az_phase(name: str) -> Dict[str, Any]:
1089710910
phase = az_fetch_diagnostics.get(name)
@@ -11517,6 +11530,11 @@ def _record_rate_limit_metadata(candidate: Any) -> None:
1151711530
allowed_hosts=allowed_hosts,
1151811531
limit=colorado_bootstrap_limit,
1151911532
)
11533+
_record_co_progress(
11534+
"bootstrap_done",
11535+
discovered=len(colorado_bootstrap_document_urls),
11536+
limit=colorado_bootstrap_limit,
11537+
)
1152011538
for document_url in colorado_bootstrap_document_urls:
1152111539
candidate_urls.append(document_url)
1152211540
if colorado_bootstrap_document_urls:
@@ -12172,6 +12190,15 @@ async def _append_document_if_rule(
1217212190
title=section_name,
1217312191
method_value=method_value,
1217412192
)
12193+
if co_progress_enabled:
12194+
accepted_count = len(statutes)
12195+
if accepted_count <= 5 or accepted_count % 25 == 0:
12196+
_record_co_progress(
12197+
"accepted_rule",
12198+
accepted=accepted_count,
12199+
source_phase=source_phase or "pending_candidate",
12200+
url=doc_url,
12201+
)
1217512202
return True
1217612203

1217712204
prefetch_candidates: List[str] = []
@@ -12807,6 +12834,13 @@ async def _append_document_if_rule(
1280712834
),
1280812835
exclude_urls=ranked_direct_exclude_urls,
1280912836
)
12837+
_record_co_progress(
12838+
"direct_detail_queue_ready",
12839+
ranked=len(ranked_urls),
12840+
queued=len(prioritized_ranked_document_urls),
12841+
seed_docs=len(prioritized_seed_document_urls),
12842+
direct_ready=int(bool(direct_detail_ready)),
12843+
)
1281012844

1281112845
if state_code == "AZ" and prioritized_ranked_document_urls:
1281212846
az_late_retry_urls = _prioritized_arizona_late_retry_urls(
@@ -13112,6 +13146,13 @@ async def _append_document_if_rule(
1311213146
continue
1311313147
seed_expansion_candidates.append((link_url, link_score + 3))
1311413148

13149+
_record_co_progress(
13150+
"direct_detail_complete",
13151+
accepted=len(statutes),
13152+
inspected=inspected_urls,
13153+
seed_expansions=len(seed_expansion_candidates),
13154+
)
13155+
1311513156
if state_code == "AZ" and len(statutes) < max_fetch:
1311613157
az_late_retry_urls = _prioritized_arizona_late_retry_urls(
1311713158
prioritized_ranked_document_urls,

ipfs_datasets_py/processors/legal_scrapers/state_procedure_rules_scraper.py

Lines changed: 257 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,24 @@
249249
"Where ",
250250
"Within ",
251251
)
252+
# Idaho Supreme Court rule landing pages to harvest, one entry per rule set.
# Each entry carries the classification fields copied onto every rule
# discovered on that page.
_ID_RULE_LIST_PAGES: List[Dict[str, str]] = [
    {
        "title_name": "Idaho Rules of Civil Procedure",
        "url": "https://isc.idaho.gov/ircp-new",
        "procedure_family": "civil_procedure",
        "legal_area": "civil_procedure",
        "official_cite_prefix": "I.R.C.P.",
    },
    {
        "title_name": "Idaho Criminal Rules",
        "url": "https://isc.idaho.gov/icr",
        "procedure_family": "criminal_procedure",
        "legal_area": "criminal_procedure",
        "official_cite_prefix": "I.C.R.",
    },
]
# Matches rule labels like "Rule 12. Defenses and Objections"; group 1 is
# the rule number (optionally dotted), group 2 the rule title.
_ID_RULE_LINK_RE = re.compile(r"^Rule\s+(\d+(?:\.\d+)?)\.\s+(.+)$", re.IGNORECASE)
# Captures dates from phrases such as "effective July 1, 2024".
_ID_EFFECTIVE_DATE_RE = re.compile(r"effective\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", re.IGNORECASE)
252270

253271
_CIVIL_PATTERNS = [
254272
re.compile(r"rules?\s+of\s+civil\s+procedure", re.IGNORECASE),
@@ -420,6 +438,17 @@ async def scrape_code(self, code_name: str, code_url: str) -> List[NormalizedSta
420438
return []
421439

422440

441+
class _IdahoProcedureRulesSupplementFetcher(BaseStateScraper):
    """Fetch-only scraper stub for Idaho Supreme Court rule pages.

    Inherits the HTTP/fetch machinery from BaseStateScraper; code listing
    and per-code scraping are intentionally empty because the Idaho
    supplement routine drives all fetching itself.
    """

    def get_base_url(self) -> str:
        """Return the Idaho Supreme Court site root."""
        return "https://isc.idaho.gov"

    def get_code_list(self) -> List[Dict[str, str]]:
        """Return no codes; discovery happens in the supplement routine."""
        return []

    async def scrape_code(self, code_name: str, code_url: str) -> List[NormalizedStatute]:
        """Scrape nothing; rule pages are fetched by the supplement routine."""
        return []
450+
451+
423452
def _extract_rhode_island_rule_links(html_text: str, page_url: str) -> List[Dict[str, str]]:
424453
try:
425454
from bs4 import BeautifulSoup
@@ -796,6 +825,127 @@ def flush() -> None:
796825
return statutes
797826

798827

828+
def _extract_idaho_rule_links(
829+
html_text: str,
830+
*,
831+
page_url: str,
832+
procedure_family: str,
833+
legal_area: str,
834+
official_cite_prefix: str,
835+
) -> List[Dict[str, str]]:
836+
try:
837+
from bs4 import BeautifulSoup
838+
except ImportError:
839+
return []
840+
841+
soup = BeautifulSoup(html_text or "", "html.parser")
842+
content = soup.select_one("div.field-name-body")
843+
if content is None:
844+
return []
845+
846+
discovered: List[Dict[str, str]] = []
847+
seen = set()
848+
for anchor in content.select("a[href]"):
849+
href = str(anchor.get("href") or "").strip()
850+
parent = anchor.find_parent("p")
851+
label_source = parent.get_text(" ", strip=True) if parent is not None else anchor.get_text(" ", strip=True)
852+
label = " ".join(str(label_source or "").split())
853+
if not href or "Rule " not in label:
854+
continue
855+
match = _ID_RULE_LINK_RE.match(label)
856+
if not match:
857+
continue
858+
absolute_url = urljoin(page_url, href)
859+
key = absolute_url.lower()
860+
if key in seen:
861+
continue
862+
seen.add(key)
863+
discovered.append(
864+
{
865+
"section_number": match.group(1).strip(),
866+
"section_name": match.group(2).strip().rstrip("."),
867+
"url": absolute_url,
868+
"procedure_family": procedure_family,
869+
"legal_area": legal_area,
870+
"official_cite_prefix": official_cite_prefix,
871+
}
872+
)
873+
874+
return discovered
875+
876+
877+
def _extract_idaho_rule_from_html(
878+
html_text: str,
879+
*,
880+
rule_url: str,
881+
title_name: str,
882+
procedure_family: str,
883+
legal_area: str,
884+
official_cite_prefix: str,
885+
) -> Optional[NormalizedStatute]:
886+
try:
887+
from bs4 import BeautifulSoup
888+
except ImportError:
889+
return None
890+
891+
soup = BeautifulSoup(html_text or "", "html.parser")
892+
content = soup.select_one("div.field-name-body div.field-item")
893+
if content is None:
894+
return None
895+
896+
text_lines = []
897+
for node in content.find_all(["p", "h2", "h3"]):
898+
text = " ".join(node.get_text(" ", strip=True).split())
899+
if not text or text == "\u00a0":
900+
continue
901+
if text.startswith("Members of the ") or text == "TERMS OF OFFICE":
902+
break
903+
text_lines.append(text)
904+
905+
if not text_lines:
906+
return None
907+
908+
heading = text_lines[0]
909+
heading_match = re.search(r"Rule\s+(\d+(?:\.\d+)?)\.\s+(.+)$", heading, re.IGNORECASE)
910+
if not heading_match:
911+
return None
912+
913+
section_number = heading_match.group(1).strip()
914+
section_name = heading_match.group(2).strip().rstrip(".")
915+
full_text = "\n".join(text_lines).strip()
916+
if len(full_text) < 40:
917+
return None
918+
919+
effective_dates = [
920+
" ".join(match.group(1).split())
921+
for match in _ID_EFFECTIVE_DATE_RE.finditer(full_text)
922+
if match.group(1)
923+
]
924+
effective_date = effective_dates[-1] if effective_dates else None
925+
926+
return NormalizedStatute(
927+
state_code="ID",
928+
state_name=US_STATES["ID"],
929+
statute_id=f"{official_cite_prefix} {section_number}",
930+
code_name=title_name,
931+
title_name=title_name,
932+
chapter_name=title_name,
933+
section_number=section_number,
934+
section_name=section_name,
935+
short_title=section_name,
936+
full_text=full_text,
937+
summary=section_name,
938+
source_url=f"{rule_url}#rule-{section_number.lower()}",
939+
official_cite=f"{official_cite_prefix} {section_number}",
940+
legal_area=legal_area,
941+
structured_data={
942+
"effective_date": effective_date,
943+
"source_kind": "idaho_supreme_court_rule_page",
944+
"procedure_family": procedure_family,
945+
},
946+
)
947+
948+
799949
def _connecticut_procedure_family_for_section(section_number: str) -> Optional[str]:
800950
numeric_prefix = int(str(section_number or "0").split("-", 1)[0] or 0)
801951
if 11 <= numeric_prefix <= 25:
@@ -2518,6 +2668,96 @@ async def _scrape_connecticut_court_rules_supplement(
25182668
return supplemental_rules, fetcher.get_fetch_analytics_snapshot()
25192669

25202670

2671+
async def _scrape_idaho_court_rules_supplement(
    *,
    existing_source_urls: Optional[set[str]] = None,
    max_rules: Optional[int] = None,
) -> tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """Fetch Idaho Supreme Court procedure rules not already captured.

    Walks each landing page in _ID_RULE_LIST_PAGES, discovers rule-detail
    links, fetches and parses each rule page, and returns the enriched
    rule dicts plus the fetcher's analytics snapshot.

    Args:
        existing_source_urls: URLs already scraped; matching rule pages
            are skipped (compared case-insensitively).
        max_rules: Optional cap on how many new rules to collect; None or
            non-positive means unlimited.

    Returns:
        Tuple of (list of enriched rule dicts, fetch-analytics dict).
    """
    fetcher = _IdahoProcedureRulesSupplementFetcher("ID", US_STATES["ID"])
    # Normalize known URLs once for case-insensitive dedupe below.
    existing_urls = {
        str(url or "").strip().lower()
        for url in (existing_source_urls or set())
        if str(url or "").strip()
    }
    # remaining is None when there is no budget cap.
    remaining = int(max_rules) if max_rules and int(max_rules) > 0 else None
    supplemental_rules: List[Dict[str, Any]] = []

    for list_page in _ID_RULE_LIST_PAGES:
        if remaining is not None and remaining <= 0:
            break

        list_url = str(list_page["url"])
        # The validator accepts the landing page only if at least one rule
        # link can be extracted from it.
        list_html = await _fetch_html_with_direct_fallback(
            fetcher,
            list_url,
            validator=lambda html: len(
                _extract_idaho_rule_links(
                    html,
                    page_url=list_url,
                    procedure_family=str(list_page["procedure_family"]),
                    legal_area=str(list_page["legal_area"]),
                    official_cite_prefix=str(list_page["official_cite_prefix"]),
                )
            )
            > 0,
            timeout_seconds=120,
        )
        if not list_html:
            continue

        rule_links = _extract_idaho_rule_links(
            list_html,
            page_url=list_url,
            procedure_family=str(list_page["procedure_family"]),
            legal_area=str(list_page["legal_area"]),
            official_cite_prefix=str(list_page["official_cite_prefix"]),
        )
        for rule in rule_links:
            if remaining is not None and remaining <= 0:
                break

            rule_url = str(rule["url"])
            if rule_url.lower() in existing_urls:
                continue

            # Accept the detail page only when it parses to a statute.
            rule_html = await _fetch_html_with_direct_fallback(
                fetcher,
                rule_url,
                validator=lambda html: _extract_idaho_rule_from_html(
                    html,
                    rule_url=rule_url,
                    title_name=str(list_page["title_name"]),
                    procedure_family=str(rule["procedure_family"]),
                    legal_area=str(rule["legal_area"]),
                    official_cite_prefix=str(rule["official_cite_prefix"]),
                )
                is not None,
                timeout_seconds=120,
            )
            if not rule_html:
                continue

            # Re-parse outside the validator to get the statute object.
            statute = _extract_idaho_rule_from_html(
                rule_html,
                rule_url=rule_url,
                title_name=str(list_page["title_name"]),
                procedure_family=str(rule["procedure_family"]),
                legal_area=str(rule["legal_area"]),
                official_cite_prefix=str(rule["official_cite_prefix"]),
            )
            if statute is None:
                continue

            enriched = fetcher._enrich_statute_structure(statute).to_dict()
            # Prefer the classifier's verdict; fall back to the page's family.
            family = _classify_procedure_family(enriched) or str(rule["procedure_family"])
            enriched["procedure_family"] = family
            supplemental_rules.append(enriched)
            existing_urls.add(rule_url.lower())
            remaining = None if remaining is None else remaining - 1

    return supplemental_rules, fetcher.get_fetch_analytics_snapshot()
2759+
2760+
25212761
def _resolve_output_dir(output_dir: Optional[str] = None) -> Path:
25222762
if output_dir:
25232763
return Path(output_dir).expanduser().resolve()
@@ -2819,6 +3059,23 @@ async def scrape_state_procedure_rules(
28193059
if ct_fetch_analytics:
28203060
supplemental_fetch_analytics_by_state[state_code] = ct_fetch_analytics
28213061

3062+
if state_code == "ID":
3063+
remaining_rule_budget = None
3064+
if max_rules and max_rules > 0:
3065+
remaining_rule_budget = max(int(max_rules) - len(procedure_statutes), 0)
3066+
id_supplement, id_fetch_analytics = await _scrape_idaho_court_rules_supplement(
3067+
existing_source_urls=seen_source_urls,
3068+
max_rules=remaining_rule_budget,
3069+
)
3070+
if id_supplement:
3071+
procedure_statutes.extend(id_supplement)
3072+
for rule in id_supplement:
3073+
family = str(rule.get("procedure_family") or "").strip()
3074+
if family:
3075+
family_counts[family] = int(family_counts.get(family, 0)) + 1
3076+
if id_fetch_analytics:
3077+
supplemental_fetch_analytics_by_state[state_code] = id_fetch_analytics
3078+
28223079
if max_rules and max_rules > 0:
28233080
procedure_statutes = procedure_statutes[: int(max_rules)]
28243081

0 commit comments

Comments (0)