
Commit dfe37c7

Merge pull request #1258 from nasa/issue_1217_polarization
issue 1255: issue 1217: fix(DIST-S1): restrict number of granules in baseline
2 parents 636d69a + b89982d commit dfe37c7

7 files changed: +117, -26 lines


data_subscriber/catalog.py

Lines changed: 4 additions & 1 deletion
@@ -142,7 +142,10 @@ def mark_download_job_id(self, batch_id, job_id):
             index=self.ES_INDEX_PATTERNS,
             body={
                 "script": {
-                    "source": f"ctx._source.download_job_id = '{job_id}'",
+                    "source": f"ctx._source.download_job_id = params['job_id']",
+                    "params": {
+                        "job_id": str(job_id)
+                    },
                     "lang": "painless"
                 },
                 "query": {

data_subscriber/cmr.py

Lines changed: 9 additions & 3 deletions
@@ -353,7 +353,7 @@ def response_jsons_to_cmr_granules(collection, response_jsons, convert_results=T
                 provider_datetime = provider_date["Date"]
                 break
         production_datetime = item["umm"].get("DataGranule").get("ProductionDateTime")
-        granules.append({
+        granule = {
             "granule_id": item["umm"].get("GranuleUR"),
             "revision_id": item.get("meta").get("revision-id"),
             "provider": item.get("meta").get("provider-id"),
@@ -378,8 +378,14 @@ def response_jsons_to_cmr_granules(collection, response_jsons, convert_results=T
                 attr.get("Values")[0]
                 for attr in item["umm"].get("AdditionalAttributes")
                 if attr.get("Name") == collection_identifier_map[collection]
-            ) if collection in collection_identifier_map else None
-        })
+            ) if collection in collection_identifier_map else None,
+        }
+        if collection == Collection.RTC_S1_V1:
+            for attr in item["umm"].get("AdditionalAttributes"):
+                if attr.get("Name") == "POLARIZATION":
+                    polarization = attr.get("Values")  # e.g. ["VV", "VH"]
+                    granule.update({"polarization": polarization})
+        granules.append(granule)
 
     return granules
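
Note: the added block reads the RTC granule's polarization out of the UMM-G AdditionalAttributes and stores it on the granule record. A self-contained sketch of that lookup against a hand-built CMR item (the granule name and attribute values are fabricated for illustration):

    # Stand-in for one CMR search result; not a real CMR response.
    item = {
        "umm": {
            "GranuleUR": "OPERA_L2_RTC-S1_T168-359429-IW1_20250101T000000Z",
            "AdditionalAttributes": [
                {"Name": "BURST_ID", "Values": ["T168-359429-IW1"]},
                {"Name": "POLARIZATION", "Values": ["VV", "VH"]},
            ],
        }
    }

    granule = {"granule_id": item["umm"].get("GranuleUR")}
    for attr in item["umm"].get("AdditionalAttributes", []):
        if attr.get("Name") == "POLARIZATION":
            # Values is a list such as ["VV", "VH"] or ["HH", "HV"]
            granule["polarization"] = attr.get("Values")

    print(granule["polarization"])  # ['VV', 'VH']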

data_subscriber/rtc_for_dist/rtc_for_dist_catalog.py

Lines changed: 4 additions & 0 deletions
@@ -19,5 +19,9 @@ def form_document(self, filename: str, granule: dict, job_id: str, query_dt: dat
 
         # Add http_urls and s3_urls to the document
         m["filtered_urls"] = granule.get("filtered_urls", [])
+        if granule.get("polarization"):
+            m["polarization"] = granule["polarization"]
+
+        m["@timestamp"] = datetime.now()  # needed for opensearch
 
         return m
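
Note: form_document only copies the polarization into the catalog document when the CMR record actually carried one. A tiny sketch of that conditional copy, with placeholder values standing in for the real granule:

    from datetime import datetime

    granule = {"filtered_urls": ["s3://bucket/example_VV.tif"], "polarization": ["VV", "VH"]}

    m = {}  # stand-in for the document form_document has built so far
    m["filtered_urls"] = granule.get("filtered_urls", [])
    if granule.get("polarization"):
        m["polarization"] = granule["polarization"]
    m["@timestamp"] = datetime.now()  # needed for opensearch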

data_subscriber/rtc_for_dist/rtc_for_dist_query.py

Lines changed: 62 additions & 18 deletions
@@ -1,25 +1,26 @@
-from collections import defaultdict
-from datetime import datetime, timedelta
-import dateutil
-from copy import deepcopy
 import asyncio
+import functools
 import json
+import operator
+import re
+from collections import Counter, defaultdict
+from copy import deepcopy
+from datetime import datetime, timedelta
 
-from util.job_submitter import try_submit_mozart_job
-
-from data_subscriber.es_conn_util import get_document_timestamp_min_max
+import dateutil
 
 from data_subscriber.cmr import CMR_TIME_FORMAT, async_query_cmr
-from data_subscriber.url import determine_acquisition_cycle, rtc_for_dist_unique_id
 from data_subscriber.cslc_utils import save_blocked_download_job, parse_r2_product_file_name
-from data_subscriber.query import BaseQuery, get_query_timerange, DateTimeRange
 from data_subscriber.dist_s1_utils import (localize_dist_burst_db, process_dist_burst_db, compute_dist_s1_triggering,
                                            extend_rtc_for_dist_records, build_rtc_native_ids, rtc_granules_by_acq_index,
                                            basic_decorate_granule, add_unique_rtc_granules, get_unique_rtc_id_for_dist,
-                                           parse_k_parameter, decorate_granule, PENDING_TYPE_RTC_FOR_DIST_DOWNLOAD)
+                                           parse_k_parameter, PENDING_TYPE_RTC_FOR_DIST_DOWNLOAD)
+from data_subscriber.es_conn_util import get_document_timestamp_min_max
+from data_subscriber.query import BaseQuery, DateTimeRange
 from data_subscriber.rtc_for_dist.dist_dependency import DistDependency, CMR_RTC_CACHE_INDEX
-
+from rtc_utils import rtc_granule_regex
 from tools.populate_cmr_rtc_cache import populate_cmr_rtc_cache, parse_rtc_granule_metadata
+from util.job_submitter import try_submit_mozart_job
 
 DIST_K_MULT_FACTOR = 2  # TODO: This should be a setting in probably settings.yaml; must be an integer
 EARLIEST_POSSIBLE_RTC_DATE = "2016-01-01T00:00:00Z"
@@ -298,7 +299,7 @@ def retrieve_baseline_granules(self, product_id, downloads, args, k_offsets_and_
         self.logger.info(f"Retrieving K-1 granules {start_date=} {end_date=} for {product_id=}")
         self.logger.debug(new_args)
 
-        # Step 1 of 2: This will return dict of acquisition_cycle -> set of granules for only onse that match the burst pattern
+        # Step 1 of 3: This will return dict of acquisition_cycle -> set of granules for only onse that match the burst pattern
         granules = asyncio.run(async_query_cmr(new_args, self.token, self.cmr, self.settings, query_timerange, datetime.now(), verbose=verbose))
         for granule in granules:
             basic_decorate_granule(granule)
@@ -307,29 +308,72 @@ def retrieve_baseline_granules(self, product_id, downloads, args, k_offsets_and_
         granules = self.unique_latest_granules(granules)
         granules_map = rtc_granules_by_acq_index(granules)
 
-        # Step 2 of 2 ...Sort that by acquisition_cycle in decreasing order and then pick the first k-1 frames
+        # Step 2 of 3 ...Sort that by acquisition_cycle in decreasing order and then pick the first k-1 frames
         acq_day_indices = sorted(granules_map.keys(), reverse=True)
+        possible_k_granules = []
         for acq_day_index in acq_day_indices:
             granules = granules_map[acq_day_index]
-            k_granules.extend(granules)
+            possible_k_granules.extend(granules)
             k_satisfied += 1
             self.logger.info(f"{product_id=} {acq_day_index=} satisfies. {k_satisfied=} {k_offset=} {k_count=} {len(granules)=}")
             if k_satisfied == k_count:
                 break
 
         counter += 1
 
+        # Step 3 of 3: Only copy over k_count per burst_id from possible_k_granules to k_granules
+        burst_id_to_granules_map = defaultdict(list)
+        for granule in possible_k_granules:
+            match_product_id = re.match(rtc_granule_regex, granule["granule_id"])
+            burst_id = match_product_id.group("burst_id")
+            if len(burst_id_to_granules_map[burst_id]) >= k_count:
+                continue  # skip any extra baseline granules per burst_id, capping the number to k_count, per k_offset
+
+            burst_id_to_granules_map[burst_id].append(granule)
+        burst_id_to_granules_map = dict(burst_id_to_granules_map)
+
+        possible_k_granules = functools.reduce(operator.add, burst_id_to_granules_map.values(), [])
+        k_granules.extend(possible_k_granules)
+
+        self.logger.info(f"{len(k_granules)=}")
         return k_granules
 
     def download_job_submission_handler(self, total_granules, query_timerange):
 
         def add_filtered_urls(granule, filtered_urls: list):
             if granule.get("filtered_urls"):
+                polarizations = []
                 for filter_url in granule.get("filtered_urls"):
-                    # Get rid of .h and mask.tif files that aren't used
-                    # NOTE: If we want to enable https downloads in the download worker, we need to change this
-                    if "s3://" in filter_url and (filter_url[-6:] in ["VV.tif", "VH.tif", "HH.tif", "HV.tif"]):
-                        filtered_urls.append(filter_url)
+                    if filter_url.endswith("VV.tif"):
+                        polarizations.append("VVVH")
+                    if filter_url.endswith("HH.tif"):
+                        polarizations.append("HHHV")
+
+                most_common_polarization = Counter(polarizations).most_common(1)
+
+                if most_common_polarization and most_common_polarization[0][0] == "VVVH":
+                    for filter_url in granule.get("filtered_urls"):
+                        # NOTE: If we want to enable https downloads in the download worker, we need to change this
+                        if not filter_url.startswith("s3://"):
+                            continue
+
+                        if any(filter_url.endswith(s) for s in ["VV.tif", "VH.tif"]):
+                            filtered_urls.append(filter_url)
+                elif most_common_polarization and most_common_polarization[0][0] == "HHHV":
+                    for filter_url in granule.get("filtered_urls"):
+                        # NOTE: If we want to enable https downloads in the download worker, we need to change this
+                        if not filter_url.startswith("s3://"):
+                            continue
+
+                        if any(filter_url.endswith(s) for s in ["HH.tif", "HV.tif"]):
+                            filtered_urls.append(filter_url)
+                else:
+                    self.logger.error(f"Unexpected polarization {most_common_polarization=}. Falling back to regular filtering.")
+                    for filter_url in granule.get("filtered_urls"):
+                        # Get rid of .h and mask.tif files that aren't used
+                        # NOTE: If we want to enable https downloads in the download worker, we need to change this
+                        if "s3://" in filter_url and (filter_url[-6:] in ["VV.tif", "VH.tif", "HH.tif", "HV.tif"]):
+                            filtered_urls.append(filter_url)
 
         batch_id_to_urls_map = defaultdict(list)
         batch_id_to_baseline_urls = defaultdict(list)
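
Note: Step 3 of 3 caps the baseline at k_count granules per burst_id, so one heavily represented burst cannot use up the whole K-1 baseline. A standalone sketch of that grouping and capping, using a simplified burst-id regex and fabricated granule ids rather than the project's rtc_granule_regex:

    import functools
    import operator
    import re
    from collections import defaultdict

    # Simplified stand-in for rtc_granule_regex; only the burst_id group matters here.
    burst_regex = r"OPERA_L2_RTC-S1_(?P<burst_id>T\d{3}-\d{6}-IW\d)_.*"

    possible_k_granules = [
        {"granule_id": "OPERA_L2_RTC-S1_T168-359429-IW1_20250101T000000Z"},
        {"granule_id": "OPERA_L2_RTC-S1_T168-359429-IW1_20250113T000000Z"},
        {"granule_id": "OPERA_L2_RTC-S1_T168-359429-IW1_20250125T000000Z"},
        {"granule_id": "OPERA_L2_RTC-S1_T168-359430-IW2_20250101T000000Z"},
    ]
    k_count = 2

    burst_id_to_granules_map = defaultdict(list)
    for granule in possible_k_granules:
        burst_id = re.match(burst_regex, granule["granule_id"]).group("burst_id")
        if len(burst_id_to_granules_map[burst_id]) >= k_count:
            continue  # cap the baseline at k_count granules per burst_id
        burst_id_to_granules_map[burst_id].append(granule)

    # Flatten back to a single list: two granules kept for the first burst, one for the second.
    k_granules = functools.reduce(operator.add, burst_id_to_granules_map.values(), [])
    assert len(k_granules) == 3

Note: in download_job_submission_handler, add_filtered_urls now votes on the granule's dominant polarization from its URL suffixes (via Counter.most_common) and then keeps only the matching co-/cross-pol GeoTIFFs on s3. A reduced sketch of that selection over fabricated URLs:

    from collections import Counter

    filtered_urls_in = [
        "s3://bucket/OPERA_L2_RTC-S1_T168-359429-IW1_VV.tif",
        "s3://bucket/OPERA_L2_RTC-S1_T168-359429-IW1_VH.tif",
        "s3://bucket/OPERA_L2_RTC-S1_T168-359429-IW1_mask.tif",
    ]

    polarizations = []
    for url in filtered_urls_in:
        if url.endswith("VV.tif"):
            polarizations.append("VVVH")  # dual-pol VV/VH product
        if url.endswith("HH.tif"):
            polarizations.append("HHHV")  # dual-pol HH/HV product

    most_common = Counter(polarizations).most_common(1)

    if most_common and most_common[0][0] == "VVVH":
        suffixes = ("VV.tif", "VH.tif")
    elif most_common and most_common[0][0] == "HHHV":
        suffixes = ("HH.tif", "HV.tif")
    else:
        suffixes = ("VV.tif", "VH.tif", "HH.tif", "HV.tif")  # fall back to keeping any polarization

    filtered_urls = [url for url in filtered_urls_in
                     if url.startswith("s3://") and url.endswith(suffixes)]
    # mask.tif is dropped; only the VV/VH GeoTIFFs remain.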

docker/job-spec.json.rtc_for_dist_query

Lines changed: 2 additions & 2 deletions
@@ -1,8 +1,8 @@
 {
   "command":"/home/ops/verdi/ops/opera-pcm/data_subscriber/rtc_for_dist/rtc_for_dist_query.sh",
   "disk_usage":"1GB",
-  "soft_time_limit": 3600,
-  "time_limit": 3660,
+  "soft_time_limit": 9939,
+  "time_limit": 9999,
   "imported_worker_files": {
     "$HOST_VERDI_HOME/.netrc": "/home/ops/.netrc",
     "$HOST_VERDI_HOME/.aws": "/home/ops/.aws",

opera_commons/es_connection.py

Lines changed: 35 additions & 2 deletions
@@ -1,12 +1,13 @@
 #!/usr/bin/env python
-from typing import Union
+from typing import Union, Any
+from urllib.parse import urlparse, unquote
 
 from hysds.celery import app
 from hysds_commons.elasticsearch_utils import ElasticsearchUtility
 from pcm_commons.query.ancillary_utility import AncillaryUtility
 from opera_commons.constants import product_metadata
 
-from .logger import logger as default_logger
+from opera_commons.logger import logger as default_logger
 
 
 GRQ_ES = None
@@ -79,6 +80,7 @@ def get_mozart_es(logger):
     es_cluster_mode = app.conf['ES_CLUSTER_MODE']
     if es_cluster_mode:
         hosts = [app.conf.JOBS_ES_URL, app.conf.GRQ_ES_URL, app.conf.METRICS_ES_URL]
+        hosts = _normalize_hosts(hosts)
     else:
         hosts = [app.conf.JOBS_ES_URL]
 
@@ -96,3 +98,34 @@ def get_mozart_es(logger):
         retry_on_timeout=True,
     )
     return MOZART_ES
+
+
+def _normalize_hosts(hosts: Any) -> Any:
+    out = []
+    # normalize hosts to dicts
+    for host in hosts:
+        if "://" not in host:
+            host = f"//{host}"  # type: ignore
+
+        parsed_url = urlparse(host)
+        h = {"host": parsed_url.hostname}
+
+        if parsed_url.port:
+            h["port"] = parsed_url.port
+        else:
+            h["port"] = 9200
+
+        if parsed_url.scheme == "https":
+            h["port"] = parsed_url.port or 443
+            h["use_ssl"] = True
+
+        if parsed_url.username or parsed_url.password:
+            h["http_auth"] = "{}:{}".format(
+                unquote(parsed_url.username),
+                unquote(parsed_url.password),
+            )
+
+        if parsed_url.path and parsed_url.path != "/":
+            h["url_prefix"] = parsed_url.path
+
+        out.append(h)
+    return out
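
Note: _normalize_hosts turns the configured ES URL strings into the host dictionaries that the Elasticsearch client accepts. A quick illustration with made-up URLs (the real values come from app.conf.JOBS_ES_URL and friends), showing the approximate output:

    # Hypothetical inputs purely for illustration.
    hosts = _normalize_hosts([
        "https://user:p%40ss@jobs-es.example.com/es",
        "grq-es.example.com",
    ])
    # Roughly:
    # [{'host': 'jobs-es.example.com', 'port': 443, 'use_ssl': True,
    #   'http_auth': 'user:p@ss', 'url_prefix': '/es'},
    #  {'host': 'grq-es.example.com', 'port': 9200}]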

tools/populate_cmr_rtc_cache.py

Lines changed: 1 addition & 0 deletions
@@ -138,6 +138,7 @@ def populate_cmr_rtc_cache(granules: List[Dict[str, Any]], es_conn) -> None:
 
         # Prepare document for indexing
         doc = {
+            "@timestamp": datetime.now(),
             "granule_id": granule["granule_id"],
             "burst_id": granule["burst_id"],
             "acquisition_timestamp": granule["acquisition_timestamp"],
