Merge pull request #28 from dgergel/feature/implement_daily_bcsd
Feature/implement daily bcsd
Joe Hamman authored Nov 5, 2020
2 parents 233b028 + f9ee410 commit e1981ae
Showing 7 changed files with 246 additions and 53 deletions.
10 changes: 10 additions & 0 deletions docs/api.rst
@@ -38,3 +38,13 @@ Transformers

LinearTrendTransformer
QuantileMapper

Groupers
~~~~~~~~~~~~

.. autosummary::
:toctree: generated/

DAY_GROUPER
MONTH_GROUPER
PaddedDOYGrouper
1 change: 1 addition & 0 deletions skdownscale/pointwise_models/__init__.py
@@ -1,5 +1,6 @@
from .bcsd import BcsdPrecipitation, BcsdTemperature
from .core import PointWiseDownscaler
from .gard import AnalogRegression, PureAnalog
from .groupers import DAY_GROUPER, MONTH_GROUPER, PaddedDOYGrouper
from .utils import LinearTrendTransformer, QuantileMapper
from .zscore import ZScoreRegressor
1 change: 1 addition & 0 deletions skdownscale/pointwise_models/base.py
@@ -83,6 +83,7 @@ def _validate_data(self, X, y=None, reset=True, validate_separately=False, **che
X, y = self._check_X_y(X, y, **check_params)
out = X, y

# TO-DO: add check_n_features attribute
if check_params.get('ensure_2d', True):
self._check_n_features(X, reset=reset)

144 changes: 98 additions & 46 deletions skdownscale/pointwise_models/bcsd.py
@@ -5,33 +5,58 @@
from sklearn.utils.validation import check_is_fitted

from .base import TimeSynchronousDownscaler
from .utils import QuantileMapper


def MONTH_GROUPER(x):
return x.month
from .groupers import DAY_GROUPER, MONTH_GROUPER, PaddedDOYGrouper
from .utils import QuantileMapper, ensure_samples_features


class BcsdBase(TimeSynchronousDownscaler):
""" Base class for BCSD model.
"""
"""Base class for BCSD model."""

_fit_attributes = ['y_climo_', 'quantile_mappers_']
_timestep = 'M'

def __init__(self, time_grouper=MONTH_GROUPER, return_anoms=True, qm_kwargs={}):
def __init__(
self,
time_grouper=MONTH_GROUPER,
climate_trend_grouper=DAY_GROUPER,
climate_trend=MONTH_GROUPER,
return_anoms=True,
qm_kwargs={},
):

self.time_grouper = time_grouper
self.climate_trend_grouper = climate_trend_grouper
self.climate_trend = climate_trend
self.return_anoms = return_anoms
self.qm_kwargs = qm_kwargs

def _pre_fit(self):
if isinstance(self.time_grouper, str):
self.time_grouper_ = pd.Grouper(freq=self.time_grouper)
if self.time_grouper == 'daily_nasa-nex':
self.time_grouper = PaddedDOYGrouper
self.timestep = 'daily'
else:
self.time_grouper_ = pd.Grouper(freq=self.time_grouper)
self.timestep = 'monthly'
else:
self.time_grouper_ = self.time_grouper
self.timestep = 'monthly'

def _create_groups(self, df, climate_trend=False):
"""helper function to create groups by either daily or month"""
if self.timestep == 'monthly':
return df.groupby(self.time_grouper)
elif self.timestep == 'daily':
if climate_trend:
# group by day only rather than also +/- offset days
return df.groupby(self.climate_trend_grouper)
else:
return self.time_grouper(df)
else:
raise TypeError('unexpected time grouper type %s' % self.time_grouper)

def _qm_fit_by_group(self, groups):
""" helper function to fit quantile mappers by group
"""helper function to fit quantile mappers by group
Note that we store these mappers for later
"""
@@ -40,7 +65,7 @@ def _qm_fit_by_group(self, groups):
self.quantile_mappers_[key] = QuantileMapper(**self.qm_kwargs).fit(group)

def _qm_transform_by_group(self, groups):
""" helper function to apply quantile mapping by group
"""helper function to apply quantile mapping by group
Note that we recombine the dataframes using pd.concat, there may be a better way to do this
"""
@@ -51,9 +76,22 @@ def _qm_transform_by_group(self, groups):
dfs.append(pd.DataFrame(qmapped, index=group.index, columns=group.columns))
return pd.concat(dfs).sort_index()

def _remove_climatology(self, obj, climatology, climate_trend=False):
"""helper function to remove climatologies"""
dfs = []
for key, group in self._create_groups(obj, climate_trend):
if self.timestep == 'monthly':
dfs.append(group - climatology.loc[key].values)
elif self.timestep == 'daily':
dfs.append(group - climatology.loc[key])

result = pd.concat(dfs).sort_index()
assert obj.shape == result.shape
return result


class BcsdPrecipitation(BcsdBase):
""" Classic BCSD model for Precipitation
"""Classic BCSD model for Precipitation
Parameters
----------
@@ -72,7 +110,7 @@ class BcsdPrecipitation(BcsdBase):
"""

def fit(self, X, y):
""" Fit BcsdPrecipitation model
"""Fit BcsdPrecipitation model
Parameters
----------
@@ -88,16 +126,19 @@ def fit(self, X, y):

self._pre_fit()
X, y = self._validate_data(X, y, y_numeric=True)
# TO-DO: set n_features_n attribute
if self.n_features_in_ != 1:
raise ValueError(f'BCSD only supports 1 feature, found {self.n_features_in_}')

y_groups = y.groupby(self.time_grouper)
y_groups = self._create_groups(y)
# calculate the climatologies
self.y_climo_ = y_groups.mean()

if self.y_climo_.values.min() <= 0:
raise ValueError('Invalid value in target climatology')

# fit the quantile mappers
# TO-DO: do we need to detrend the data before fitting the quantile mappers??
self._qm_fit_by_group(y_groups)

return self
@@ -119,23 +160,28 @@ def predict(self, X):
X = self._validate_data(X)

# Bias correction
# apply quantile mapping by month
Xqm = self._qm_transform_by_group(X.groupby(self.time_grouper))
# apply quantile mapping by month or day
Xqm = self._qm_transform_by_group(self._create_groups(X, climate_trend=True))

# calculate the anomalies as a ratio of the training data
if self.return_anoms:
return self._calc_ratio_anoms(Xqm, self.y_climo_)
else:
return Xqm

def _calc_ratio_anoms(self, obj, climatology):
def _calc_ratio_anoms(self, obj, climatology, climate_trend=False):
"""helper function for dividing day groups by climatology"""
dfs = []
for key, group in obj.groupby(self.time_grouper):
dfs.append(group / climatology.loc[key].values)
for key, group in self._create_groups(obj, climate_trend):
if self.timestep == 'monthly':
dfs.append(group / climatology.loc[key].values)
else:
dfs.append(group / climatology.loc[key])

result = pd.concat(dfs).sort_index()
assert obj.shape == result.shape

out = pd.concat(dfs).sort_index()
assert obj.shape == out.shape
return out
return result

def _more_tags(self):
return {
@@ -162,7 +208,7 @@ def _more_tags(self):

class BcsdTemperature(BcsdBase):
def fit(self, X, y):
""" Fit BcsdTemperature model
"""Fit BcsdTemperature model
Parameters
----------
@@ -175,14 +221,18 @@ def fit(self, X, y):
-------
self : returns an instance of self.
"""

self._pre_fit()
X, y = self._validate_data(X, y, y_numeric=True)
# TO-DO: set n_features_in attribute
if self.n_features_in_ != 1:
raise ValueError(f'BCSD only supports 1 feature, found {self.n_features_in_}')
raise ValueError(f'BCSD only supports up to 4 features, found {self.n_features_in_}')

# make groups for day or month
y_groups = self._create_groups(y)

# calculate the climatologies
self._x_climo = X.groupby(self.time_grouper).mean()
y_groups = y.groupby(self.time_grouper)
self._x_climo = self._create_groups(X).mean()
self.y_climo_ = y_groups.mean()

# fit the quantile mappers
@@ -191,7 +241,7 @@ def fit(self, X, y):
return self

def predict(self, X):
""" Predict using the BcsdTemperature model
"""Predict using the BcsdTemperature model
Parameters
----------
@@ -206,42 +256,44 @@ def predict(self, X):
check_is_fitted(self)
X = self._check_array(X)

# X = ensure_samples_features(X) # don't need????

# Calculate the 9-year running mean for each month
def rolling_func(x):
return x.rolling(9, center=True, min_periods=1).mean()

X_rolling_mean = X.groupby(self.time_grouper, group_keys=False).apply(rolling_func)
X_rolling_mean = X.groupby(self.climate_trend, group_keys=False).apply(rolling_func)

# calc shift
# why isn't this working??
# X_shift = X_rolling_mean.groupby(self.time_grouper) - self._x_climo
X_shift = self._remove_climatology(X_rolling_mean, self._x_climo)
# remove climatology from 9-year monthly mean climate trend
X_shift = self._remove_climatology(X_rolling_mean, self._x_climo, climate_trend=True)

# remove shift
# remove shift from model data
X_no_shift = X - X_shift

# Bias correction
# apply quantile mapping by month
Xqm = self._qm_transform_by_group(X_no_shift.groupby(self.time_grouper))
# apply quantile mapping by month or day
Xqm = self._qm_transform_by_group(self._create_groups(X_no_shift, climate_trend=True))

# restore the shift
# restore the climate trend
X_qm_with_shift = X_shift + Xqm
# calculate the anomalies

# return bias corrected absolute values or calculate the anomalies
if self.return_anoms:
return self._remove_climatology(X_qm_with_shift, self.y_climo_)
else:
return X_qm_with_shift

def _remove_climatology(self, obj, climatology):
def _remove_climatology(self, obj, climatology, climate_trend=False):
"""helper function to remove climatologies"""
dfs = []
for key, group in obj.groupby(self.time_grouper):
dfs.append(group - climatology.loc[key].values)

out = pd.concat(dfs).sort_index()
assert obj.shape == out.shape
return out
for key, group in self._create_groups(obj, climate_trend):
if self.timestep == 'monthly':
dfs.append(group - climatology.loc[key].values)
elif self.timestep == 'daily':
dfs.append(group - climatology.loc[key].values)

result = pd.concat(dfs).sort_index()
if obj.shape != result.shape:
raise ValueError('shape of climo is not equal to input array')
return result

def _more_tags(self):
return {
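
The changes to bcsd.py above let BcsdTemperature (and BcsdPrecipitation) switch from monthly grouping to the padded day-of-year grouping when the string 'daily_nasa-nex' is passed as time_grouper; _pre_fit then substitutes PaddedDOYGrouper for the pandas monthly grouper. The following is only a rough, unverified sketch of how that entry point might be exercised; the column name and synthetic data are placeholders and not part of this PR:

    import numpy as np
    import pandas as pd

    from skdownscale.pointwise_models import BcsdTemperature

    # synthetic single-column daily series (fit() raises if more than one
    # feature column is supplied)
    index = pd.date_range('1990-01-01', '1999-12-31', freq='D')
    X = pd.DataFrame(
        {'tmax': 15 + 10 * np.sin(2 * np.pi * index.dayofyear / 365.25)}, index=index
    )
    y = X + 1.5  # stand-in "observed" series with a constant bias

    # 'daily_nasa-nex' selects the PaddedDOYGrouper-based daily pathway in _pre_fit
    model = BcsdTemperature(time_grouper='daily_nasa-nex', return_anoms=False)
    model.fit(X, y)
    downscaled = model.predict(X)
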
89 changes: 89 additions & 0 deletions skdownscale/pointwise_models/groupers.py
@@ -0,0 +1,89 @@
import warnings

import numpy as np
import pandas as pd


class SkdownscaleGroupGeneratorBase:
pass


def MONTH_GROUPER(x):
return x.month


def DAY_GROUPER(x):
return x.day


class PaddedDOYGrouper(SkdownscaleGroupGeneratorBase):
def __init__(self, df, offset=15):
self.n = 1
self.df = df
self.max = 366
# check for leap days
# if leap days present, flag for day groups count
if len(self.df[((self.df.index.month == 2) & (self.df.index.day == 29))]) > 0:
self.leap = 'leap'
else:
self.leap = 'noleap'
# split up data by leap and non leap years
# necessary because pandas dayofyear
self.df_leap = self.df[self.df.index.is_leap_year]
self.df_noleap = self.df[~self.df.index.is_leap_year]
self.offset = offset
self.days_of_nonleap_year = np.arange(self.n, self.max)
self.days_of_leap_year = np.arange(self.n, self.max + 1)
self.days_of_nonleap_year_wrapped = np.pad(
self.days_of_nonleap_year, self.offset, mode='wrap'
)
self.days_of_leap_year_wrapped = np.pad(self.days_of_leap_year, self.offset, mode='wrap')

def __iter__(self):
self.n = 1
return self

def __next__(self):
# n as day of year
if self.n > self.max:
raise StopIteration

i = self.n - 1
total_days = (2 * self.offset) + 1

# create day groups with +/- offset # of days
first_set_leap = self.days_of_leap_year_wrapped[i : i + self.offset]
first_set_noleap = self.days_of_nonleap_year_wrapped[i : i + self.offset]

sec_set_leap = self.days_of_leap_year_wrapped[self.n + self.offset : i + total_days]
sec_set_noleap = self.days_of_nonleap_year_wrapped[self.n + self.offset : i + total_days]

all_days_leap = np.concatenate((first_set_leap, np.array([self.n]), sec_set_leap), axis=0)
all_days_noleap = np.concatenate(
(first_set_noleap, np.array([self.n]), sec_set_noleap), axis=0
)

# check that day groups contain the correct number of days
if len(set(all_days_leap)) != total_days and self.leap == 'noleap':
warnings.warn('leap days not included, day groups in leap years missing leap days')

if len(set(all_days_noleap)) != total_days and self.n != 366:
raise ValueError('no leap day groups do not contain the correct set of days')

result = pd.concat(
[
self.df_leap[self.df_leap.index.dayofyear.isin(all_days_leap)],
self.df_noleap[self.df_noleap.index.dayofyear.isin(all_days_noleap)],
]
)

self.n += 1

return self.n - 1, result

def mean(self):
arr_means = np.full((self.max, 1), np.inf)
for key, group in self:
arr_means[key - 1] = group.mean().values[0]
result = pd.DataFrame(arr_means, index=self.days_of_leap_year)
return result
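
To illustrate the iteration contract of the new PaddedDOYGrouper, the sketch below (not part of the PR; the toy DataFrame is made up) walks the generator over a daily series covering one non-leap and one leap year. Each iteration yields a day-of-year key together with every row falling within +/- offset calendar days of that day, wrapping across the year boundary, and mean() collapses those windows into a per-day climatology:

    import numpy as np
    import pandas as pd

    from skdownscale.pointwise_models import PaddedDOYGrouper

    index = pd.date_range('1999-01-01', '2000-12-31', freq='D')  # 2000 is a leap year
    df = pd.DataFrame({'tmax': np.random.rand(len(index))}, index=index)

    grouper = PaddedDOYGrouper(df, offset=15)
    for doy, window in grouper:
        # `window` holds all rows whose day of year lies within 15 days of `doy`
        pass

    climatology = grouper.mean()  # one padded-window mean per day of year (1-366)
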