Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/implement daily bcsd #28

Merged
merged 39 commits into from
Nov 5, 2020
Merged
Show file tree
Hide file tree
Changes from 37 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
2a35885
first few steps of daily bcsd - custom grouper class
dgergel Oct 29, 2019
c1c31b8
add test notebook for daily bcsd
dgergel Oct 29, 2019
18068b0
fixing merge conflict after pulling in upstream master
dgergel Apr 2, 2020
1eb3b39
updating daily bcsd implementation to include monthly resampling to d…
dgergel Apr 2, 2020
990ddb2
remove old test notebook
dgergel Apr 2, 2020
9400f97
add updated notebook for testing daily bias correction
dgergel Apr 2, 2020
e281e8a
fix merge conflict
dgergel Jun 8, 2020
8c65c36
add old helper function to check datetime indices
dgergel Jun 9, 2020
4a3dfeb
fix naming in previous commit
dgergel Jun 9, 2020
eb090b2
add check_datetime_index import from .utils
dgergel Jun 9, 2020
cdd0838
DOY grouper fixes, still needs cleanup
dgergel Jun 23, 2020
f6a12da
cleanup bcsd temperature and bcsd class helper functions
dgergel Jun 24, 2020
84d3aa2
add bc model doy updates for precip
dgergel Jun 24, 2020
48f5bc9
clean up bcsdprecipitation model
dgergel Jun 24, 2020
6daa1ba
Merge branch 'master' of github.com:jhamman/scikit-downscale into fea…
dgergel Jun 24, 2020
8b6ef5a
remove test notebook
dgergel Jun 24, 2020
1ae1a6d
remove commented out code
dgergel Jun 24, 2020
1d9cdc5
fix formatting check error
dgergel Jun 25, 2020
1f58985
fix isort error
dgergel Jun 25, 2020
9036b9d
remove trailing whitespace
dgergel Jun 25, 2020
65fe1d6
fix persistent formatting check error
dgergel Jun 25, 2020
1f2948e
fix merge conflict
dgergel Sep 25, 2020
bfc051c
move DAY_GROUPER and MONTH_GROUPER to groupers module
dgergel Sep 25, 2020
3621ce4
fix tests/accidental mod from merge conflict
dgergel Sep 25, 2020
7a5455f
fix assertion error in failing tests
dgergel Sep 25, 2020
d41ebe7
fix tests, comment out validate_data fctn that isn't working atm
dgergel Sep 25, 2020
56162a6
updates to PaddedDOYGrouper from review suggestions
dgergel Sep 27, 2020
38ac4ab
add groupers classes to api docs
dgergel Sep 27, 2020
e75909d
add support for leap days into PaddedDOYGrouper
dgergel Sep 29, 2020
3ada0eb
further updates to leap year handling in PaddedDOYGrouper
dgergel Sep 29, 2020
ed932a7
attempt to fix failing CI tests
Sep 30, 2020
740befc
fix lint CI test
dgergel Sep 30, 2020
7e65248
fix lint CI test with pre-commit
dgergel Sep 30, 2020
acce768
quick bug fix to paddeddoygrouper
dgergel Oct 14, 2020
65db340
add unit tests for nasa-nex option and PaddedDOYGrouper
dgergel Oct 14, 2020
fc85b9d
fix linter errors
dgergel Oct 14, 2020
0e43bd0
fix flake8
dgergel Oct 14, 2020
b046af1
remove check_datetime_index fctn
dgergel Nov 3, 2020
f9ee410
remove fctn import
dgergel Nov 3, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,13 @@ Transformers

LinearTrendTransformer
QuantileMapper

Groupers
~~~~~~~~~~~~

.. autosummary::
:toctree: generated/

DAY_GROUPER
MONTH_GROUPER
PaddedDOYGrouper
1 change: 1 addition & 0 deletions skdownscale/pointwise_models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from .bcsd import BcsdPrecipitation, BcsdTemperature
from .core import PointWiseDownscaler
from .gard import AnalogRegression, PureAnalog
from .groupers import DAY_GROUPER, MONTH_GROUPER, PaddedDOYGrouper
from .utils import LinearTrendTransformer, QuantileMapper
from .zscore import ZScoreRegressor
1 change: 1 addition & 0 deletions skdownscale/pointwise_models/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def _validate_data(self, X, y=None, reset=True, validate_separately=False, **che
X, y = self._check_X_y(X, y, **check_params)
out = X, y

# TO-DO: add check_n_features attribute
if check_params.get('ensure_2d', True):
self._check_n_features(X, reset=reset)

Expand Down
144 changes: 98 additions & 46 deletions skdownscale/pointwise_models/bcsd.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,33 +5,58 @@
from sklearn.utils.validation import check_is_fitted

from .base import TimeSynchronousDownscaler
from .utils import QuantileMapper


def MONTH_GROUPER(x):
return x.month
from .groupers import DAY_GROUPER, MONTH_GROUPER, PaddedDOYGrouper
from .utils import QuantileMapper, check_datetime_index, ensure_samples_features


class BcsdBase(TimeSynchronousDownscaler):
""" Base class for BCSD model.
"""
"""Base class for BCSD model."""

_fit_attributes = ['y_climo_', 'quantile_mappers_']
_timestep = 'M'

def __init__(self, time_grouper=MONTH_GROUPER, return_anoms=True, qm_kwargs={}):
def __init__(
self,
time_grouper=MONTH_GROUPER,
climate_trend_grouper=DAY_GROUPER,
climate_trend=MONTH_GROUPER,
return_anoms=True,
qm_kwargs={},
):

self.time_grouper = time_grouper
self.climate_trend_grouper = climate_trend_grouper
self.climate_trend = climate_trend
self.return_anoms = return_anoms
self.qm_kwargs = qm_kwargs

def _pre_fit(self):
if isinstance(self.time_grouper, str):
self.time_grouper_ = pd.Grouper(freq=self.time_grouper)
if self.time_grouper == 'daily_nasa-nex':
self.time_grouper = PaddedDOYGrouper
self.timestep = 'daily'
else:
self.time_grouper_ = pd.Grouper(freq=self.time_grouper)
self.timestep = 'monthly'
else:
self.time_grouper_ = self.time_grouper
self.timestep = 'monthly'

def _create_groups(self, df, climate_trend=False):
"""helper function to create groups by either daily or month"""
if self.timestep == 'monthly':
return df.groupby(self.time_grouper)
elif self.timestep == 'daily':
if climate_trend:
# group by day only rather than also +/- offset days
return df.groupby(self.climate_trend_grouper)
else:
return self.time_grouper(df)
else:
raise TypeError('unexpected time grouper type %s' % self.time_grouper)

def _qm_fit_by_group(self, groups):
""" helper function to fit quantile mappers by group
"""helper function to fit quantile mappers by group

Note that we store these mappers for later
"""
Expand All @@ -40,7 +65,7 @@ def _qm_fit_by_group(self, groups):
self.quantile_mappers_[key] = QuantileMapper(**self.qm_kwargs).fit(group)

def _qm_transform_by_group(self, groups):
""" helper function to apply quantile mapping by group
"""helper function to apply quantile mapping by group

Note that we recombine the dataframes using pd.concat, there may be a better way to do this
"""
Expand All @@ -51,9 +76,22 @@ def _qm_transform_by_group(self, groups):
dfs.append(pd.DataFrame(qmapped, index=group.index, columns=group.columns))
return pd.concat(dfs).sort_index()

def _remove_climatology(self, obj, climatology, climate_trend=False):
"""helper function to remove climatologies"""
dfs = []
for key, group in self._create_groups(obj, climate_trend):
if self.timestep == 'monthly':
dfs.append(group - climatology.loc[key].values)
elif self.timestep == 'daily':
dfs.append(group - climatology.loc[key])

result = pd.concat(dfs).sort_index()
assert obj.shape == result.shape
return result


class BcsdPrecipitation(BcsdBase):
""" Classic BCSD model for Precipitation
"""Classic BCSD model for Precipitation

Parameters
----------
Expand All @@ -72,7 +110,7 @@ class BcsdPrecipitation(BcsdBase):
"""

def fit(self, X, y):
""" Fit BcsdPrecipitation model
"""Fit BcsdPrecipitation model

Parameters
----------
Expand All @@ -88,16 +126,19 @@ def fit(self, X, y):

self._pre_fit()
X, y = self._validate_data(X, y, y_numeric=True)
# TO-DO: set n_features_n attribute
if self.n_features_in_ != 1:
raise ValueError(f'BCSD only supports 1 feature, found {self.n_features_in_}')

y_groups = y.groupby(self.time_grouper)
y_groups = self._create_groups(y)
# calculate the climatologies
self.y_climo_ = y_groups.mean()

if self.y_climo_.values.min() <= 0:
raise ValueError('Invalid value in target climatology')

# fit the quantile mappers
# TO-DO: do we need to detrend the data before fitting the quantile mappers??
self._qm_fit_by_group(y_groups)

return self
Expand All @@ -119,23 +160,28 @@ def predict(self, X):
X = self._validate_data(X)

# Bias correction
# apply quantile mapping by month
Xqm = self._qm_transform_by_group(X.groupby(self.time_grouper))
# apply quantile mapping by month or day
Xqm = self._qm_transform_by_group(self._create_groups(X, climate_trend=True))

# calculate the anomalies as a ratio of the training data
if self.return_anoms:
return self._calc_ratio_anoms(Xqm, self.y_climo_)
else:
return Xqm

def _calc_ratio_anoms(self, obj, climatology):
def _calc_ratio_anoms(self, obj, climatology, climate_trend=False):
"""helper function for dividing day groups by climatology"""
dfs = []
for key, group in obj.groupby(self.time_grouper):
dfs.append(group / climatology.loc[key].values)
for key, group in self._create_groups(obj, climate_trend):
if self.timestep == 'monthly':
dfs.append(group / climatology.loc[key].values)
else:
dfs.append(group / climatology.loc[key])

result = pd.concat(dfs).sort_index()
assert obj.shape == result.shape

out = pd.concat(dfs).sort_index()
assert obj.shape == out.shape
return out
return result

def _more_tags(self):
return {
Expand All @@ -162,7 +208,7 @@ def _more_tags(self):

class BcsdTemperature(BcsdBase):
def fit(self, X, y):
""" Fit BcsdTemperature model
"""Fit BcsdTemperature model

Parameters
----------
Expand All @@ -175,14 +221,18 @@ def fit(self, X, y):
-------
self : returns an instance of self.
"""

self._pre_fit()
X, y = self._validate_data(X, y, y_numeric=True)
# TO-DO: set n_features_in attribute
if self.n_features_in_ != 1:
raise ValueError(f'BCSD only supports 1 feature, found {self.n_features_in_}')
raise ValueError(f'BCSD only supports up to 4 features, found {self.n_features_in_}')

# make groups for day or month
y_groups = self._create_groups(y)

# calculate the climatologies
self._x_climo = X.groupby(self.time_grouper).mean()
y_groups = y.groupby(self.time_grouper)
self._x_climo = self._create_groups(X).mean()
self.y_climo_ = y_groups.mean()

# fit the quantile mappers
Expand All @@ -191,7 +241,7 @@ def fit(self, X, y):
return self

def predict(self, X):
""" Predict using the BcsdTemperature model
"""Predict using the BcsdTemperature model

Parameters
----------
Expand All @@ -206,42 +256,44 @@ def predict(self, X):
check_is_fitted(self)
X = self._check_array(X)

# X = ensure_samples_features(X) # don't need????

# Calculate the 9-year running mean for each month
def rolling_func(x):
return x.rolling(9, center=True, min_periods=1).mean()

X_rolling_mean = X.groupby(self.time_grouper, group_keys=False).apply(rolling_func)
X_rolling_mean = X.groupby(self.climate_trend, group_keys=False).apply(rolling_func)

# calc shift
# why isn't this working??
# X_shift = X_rolling_mean.groupby(self.time_grouper) - self._x_climo
X_shift = self._remove_climatology(X_rolling_mean, self._x_climo)
# remove climatology from 9-year monthly mean climate trend
X_shift = self._remove_climatology(X_rolling_mean, self._x_climo, climate_trend=True)

# remove shift
# remove shift from model data
X_no_shift = X - X_shift

# Bias correction
# apply quantile mapping by month
Xqm = self._qm_transform_by_group(X_no_shift.groupby(self.time_grouper))
# apply quantile mapping by month or day
Xqm = self._qm_transform_by_group(self._create_groups(X_no_shift, climate_trend=True))

# restore the shift
# restore the climate trend
X_qm_with_shift = X_shift + Xqm
# calculate the anomalies

# return bias corrected absolute values or calculate the anomalies
if self.return_anoms:
return self._remove_climatology(X_qm_with_shift, self.y_climo_)
else:
return X_qm_with_shift

def _remove_climatology(self, obj, climatology):
def _remove_climatology(self, obj, climatology, climate_trend=False):
"""helper function to remove climatologies"""
dfs = []
for key, group in obj.groupby(self.time_grouper):
dfs.append(group - climatology.loc[key].values)

out = pd.concat(dfs).sort_index()
assert obj.shape == out.shape
return out
for key, group in self._create_groups(obj, climate_trend):
if self.timestep == 'monthly':
dfs.append(group - climatology.loc[key].values)
elif self.timestep == 'daily':
dfs.append(group - climatology.loc[key].values)

result = pd.concat(dfs).sort_index()
if obj.shape != result.shape:
raise ValueError('shape of climo is not equal to input array')
return result

def _more_tags(self):
return {
Expand Down
89 changes: 89 additions & 0 deletions skdownscale/pointwise_models/groupers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import warnings

import numpy as np
import pandas as pd


class SkdownscaleGroupGeneratorBase:
pass


def MONTH_GROUPER(x):
return x.month


def DAY_GROUPER(x):
return x.day


class PaddedDOYGrouper(SkdownscaleGroupGeneratorBase):
def __init__(self, df, offset=15):
self.n = 1
self.df = df
self.max = 366
# check for leap days
# if leap days present, flag for day groups count
if len(self.df[((self.df.index.month == 2) & (self.df.index.day == 29))]) > 0:
self.leap = 'leap'
else:
self.leap = 'noleap'
# split up data by leap and non leap years
# necessary because pandas dayofyear
self.df_leap = self.df[self.df.index.is_leap_year]
self.df_noleap = self.df[~self.df.index.is_leap_year]
self.offset = offset
self.days_of_nonleap_year = np.arange(self.n, self.max)
self.days_of_leap_year = np.arange(self.n, self.max + 1)
self.days_of_nonleap_year_wrapped = np.pad(
self.days_of_nonleap_year, self.offset, mode='wrap'
)
self.days_of_leap_year_wrapped = np.pad(self.days_of_leap_year, self.offset, mode='wrap')

def __iter__(self):
self.n = 1
return self

def __next__(self):
# n as day of year
if self.n > self.max:
raise StopIteration

i = self.n - 1
total_days = (2 * self.offset) + 1

# create day groups with +/- offset # of days
first_set_leap = self.days_of_leap_year_wrapped[i : i + self.offset]
first_set_noleap = self.days_of_nonleap_year_wrapped[i : i + self.offset]

sec_set_leap = self.days_of_leap_year_wrapped[self.n + self.offset : i + total_days]
sec_set_noleap = self.days_of_nonleap_year_wrapped[self.n + self.offset : i + total_days]

all_days_leap = np.concatenate((first_set_leap, np.array([self.n]), sec_set_leap), axis=0)
all_days_noleap = np.concatenate(
(first_set_noleap, np.array([self.n]), sec_set_noleap), axis=0
)

# check that day groups contain the correct number of days
if len(set(all_days_leap)) != total_days and self.leap == 'noleap':
warnings.warn('leap days not included, day groups in leap years missing leap days')

if len(set(all_days_noleap)) != total_days and self.n != 366:
raise ValueError('no leap day groups do not contain the correct set of days')

result = pd.concat(
[
self.df_leap[self.df_leap.index.dayofyear.isin(all_days_leap)],
self.df_noleap[self.df_noleap.index.dayofyear.isin(all_days_noleap)],
]
)

self.n += 1

return self.n - 1, result

def mean(self):
arr_means = np.full((self.max, 1), np.inf)
for key, group in self:
arr_means[key - 1] = group.mean().values[0]
result = pd.DataFrame(arr_means, index=self.days_of_leap_year)
return result
Loading