Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automatic threshold for peaks-over-threshold #268

Merged
merged 16 commits into from
Oct 30, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 112 additions & 0 deletions MATLAB/findPOT.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
function [ excess,peak_ind,lambda,avg_sz,tau ] = findPOT( Hs,srate,windsz,thresh )
%findPOT finds independent peaks over a specified threshold value for UNCENSORED Hs data.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%When using all time records in a year:
%   Input Hs should be an nx1 vector of time series data of significant wave
%   heights.
%When using a subset of time records in a year (i.e. seasonal data, using
%Oct to Apr only, etc.):
%   Input Hs should be an nx1 or 1xn cell array of time series data of significant wave
%   heights, where each cell is a vector of time series data from a single season/a single
%   specified time period. For example, if computing Hs return periods from a
%   hindcast ranging from 2000 to 2010 using Oct through Apr data, Hs{1} is
%   the time series from Oct 2000 to Apr 2001, Hs{2} is the time series
%   from Oct 2001 to Apr 2002, etc.
%
%Other Input:
%   srate  - The sampling rate, in hours, of the data
%   windsz - The window size, in hours, to be used to ensure independent
%            peaks
%   thresh - The threshold, in meters, to be used for determining peaks
%
%Output:
%   excess   - excesses of the independent peaks over the threshold
%   peak_ind - index of each independent peak within its season's series
%   lambda   - average number of independent peaks per year/season
%   avg_sz   - mean cluster size (NaN if no season had exceedances; when
%              cell input is used this reflects the last season processed)
%   tau      - accumulated mean storm spacing (in observations) divided by
%              ny. NOTE(review): for non-cell input this divides a single
%              season's mean by the number of years - confirm intended.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%if full set is given, convert to cell array and compute # years in series
if(~iscell(Hs))
    ny = size(Hs,1)*srate/(24*365.25); %number of years in time series
    Hs = {Hs};
    n = 1;
else
    %if cell array is given, the number of years/seasons represented by
    %series is number of cells in array.
    ny = numel(Hs);
    n = ny;
end

%convert window size from 'hours between peaks' to 'observations between peaks'
window = windsz/srate;
excess = [];
peak_ind = [];
tau = 0;
avg_sz = NaN; %remains NaN if no season has any exceedances
%loop through each season (only one when full dataset is used, jan - dec)
for season = 1:n
    %convert to vector for easier use
    Hs_curr = Hs{season};

    %find all points above the threshold
    ind = find(Hs_curr > thresh);

    %avg time between storms. FIX: gaps in 'temp' are measured in
    %observations, so compare against window/2 (observations), not
    %windsz/2 (hours). Also guard against mean([]) = NaN.
    temp = diff(ind);
    gaps = temp(temp > window/2);
    if(~isempty(gaps))
        tau = tau + mean(gaps);
    end

    nex = numel(ind); %FIX: size(ind,1) is 1 for an empty 1x0 row vector
    if(nex > 0)
        %identify start and end points of all groups of consecutive points
        %above threshold. FIX: check every consecutive pair starting at the
        %first one (the original loop started at 2 and missed the gap
        %between the first and second exceedance).
        stpt = ind(1);
        endpt = [];
        for j = 1:nex - 1
            if(ind(j + 1) - ind(j) > 1)
                endpt = [endpt;ind(j)];
                stpt = [stpt;ind(j + 1)];
            end
        end
        endpt = [endpt;ind(end)];
        avg_sz = mean(endpt - stpt);
        %set number clusters
        nclust = size(endpt,1);

        %find maxima for each cluster
        peaks = NaN(nclust,2);
        for c = 1:nclust
            clustrang = stpt(c):endpt(c);
            [peaks(c,1),tempi] = max(Hs_curr(clustrang));
            peaks(c,2) = clustrang(tempi);
        end

        %for independence, check that new peak is more than window from old
        %peak. if time between is less than window, use only larger of two.
        peak_dist = diff(peaks(:,2));
        if(any(peak_dist < window))
            newp = peaks(1,:);
            for c = 2:nclust
                if(peak_dist(c-1) > window)
                    newp = [newp;peaks(c,:)];
                else
                    if(newp(end,1) > peaks(c,1))
                        %kept peak is larger; re-measure next distance from
                        %the kept peak instead of the dropped one
                        if(c < nclust)
                            peak_dist(c) = peaks(c+1,2) - newp(end,2);
                        end
                    else
                        newp(end,:) = peaks(c,:);
                    end
                end
            end

            peaks = newp;
        end

        %compute excesses and index of peaks to return
        excess = [excess;peaks(:,1) - thresh];
        peak_ind = [peak_ind;peaks(:,2)];
    end
end

%compute average number of clusters per year/season/time frame
lambda = numel(excess)/ny;
tau = tau/ny;
end

105 changes: 105 additions & 0 deletions MATLAB/threshold.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
function [pct, best_thresh, flags, POT_info] = threshold(Hs,samp_rate)
%threshold finds the best significant wave height threshold for the
%peaks-over-threshold method by maximizing the correlation of a QQ plot of
%the peak excesses against a fitted Generalized Pareto distribution.
%
%Input:
%   Hs        - nx1 vector of significant wave height time series data
%   samp_rate - The sampling rate, in hours, of the data
%
%Output:
%   pct         - percentile (in percent, e.g. 99.2) of the chosen threshold
%   best_thresh - threshold value (same units as Hs) with best correlation
%   flags       - struct of warning flags raised during the search
%   POT_info    - struct of auxiliary info (window size used)
%
%On failure (acf never drops below 0.5 within 14 days), pct and
%best_thresh are returned as NaN so the caller can detect the condition.

%use the autocorrelation over the first 14 days to pick the window that
%ensures peak independence
nlags = floor(14*24/samp_rate);
[acf,lag] = xcorr(Hs - mean(Hs), nlags, 'coeff');
positive_lag = lag((nlags+1):end);
positive_acf = acf((nlags+1):end); %FIX: semicolon added (debug echo removed)
below_thresh = find(positive_acf < 0.5);

%if it doesn't drop below 0.5 in first 14 days (nlags) then the user should
%double check their input OR should be fitting this manually to choose the
%window.
if isempty(below_thresh)
    fprintf(strcat('ERROR: The acf does not drop below 0.5 in first 14 days, check inputs', ...
        ' and retry function, or fit manually.\n'))
    %FIX: assign the declared outputs before returning (the original
    %assigned undeclared variables, leaving pct/best_thresh undefined)
    pct = NaN;
    best_thresh = NaN;
    flags.acf_above_half = true;
    POT_info = struct();
    return
end

%set window size (in hours) to the time where acf dropped below 0.5
windsize = samp_rate*positive_lag(min(below_thresh));

%set flag if the window is wider than 4 days
if windsize > 24*4 - 1
    fprintf('WARNING: The acf window is over 4 days. Check final fit.\n')
    flags.large_window_size = true;
else
    flags.large_window_size = false;
end

%save the window size in info structure
POT_info.window_size = windsize;

distribution = 'GeneralizedPareto';

%initialize the first set of thresholds to be tested as the percentiles
%ranging from 99 to 99.5 percentile
pct_step = 0.1;
thresh_pct = 99:pct_step:99.5;
thresh_test = prctile(Hs,thresh_pct);
keep_going = true;
current_best_thresh = -100; %sentinel so the first pass never converges
flags.test_lambda_below_1 = false;

%loop through until the maximum R is found or the number of peaks per
%year chosen drops to ~1 per year (annual maxima method)
while(keep_going)
    thresh_corr = NaN(size(thresh_test));
    for i = 1:length(thresh_test)
        %find independent peaks over the test threshold
        [excess,~,lambda,~] = findPOT(Hs,samp_rate,windsize,thresh_test(i));

        %fit distribution to the excesses
        POTdist = fitdist(excess,distribution);

        %correlation of the QQ plot measures goodness of fit
        QQ_plt = qqplot(excess,POTdist);
        X_data = get(QQ_plt,'Xdata');
        Y_data = get(QQ_plt,'Ydata');
        thresh_corr(i) = corr(X_data{1}',Y_data{1}');

        %if lambda falls below one, the sample set is now less than one
        %peak per year and should not be used, (might as well use AM)
        if(lambda < 1)
            thresh_corr(i) = 0;
            flags.test_lambda_below_1 = true;
        end

    end
    %find threshold with maximum correlation
    [~,max_i] = max(thresh_corr);

    %if the threshold hasn't changed more than 0.05m OR the number of
    %samples per year has dropped below 2 (~one per year), then quit
    if(abs(current_best_thresh - thresh_test(max_i)) < 0.05 || lambda < 2)
        best_thresh = thresh_test(max_i);
        pct = thresh_pct(max_i);
        keep_going = false;
        flags.loop_stopped_by_lambda = (lambda < 2);
    %otherwise, find a finer range around the new maximum and loop again
    else
        current_best_thresh = thresh_test(max_i);
        pct_step = pct_step/10;
        if(max_i == length(thresh_test))
            thresh_pct = thresh_pct(max_i-1):pct_step:thresh_pct(max_i)+5*pct_step;
        elseif(max_i == 1)
            thresh_pct = thresh_pct(max_i)-9*pct_step:pct_step:thresh_pct(max_i+1);
        else
            thresh_pct = thresh_pct(max_i-1):pct_step:thresh_pct(max_i+1);
        end
        thresh_test = prctile(Hs,thresh_pct);
    end
end
end
1 change: 1 addition & 0 deletions examples/data/loads/data_loads_hs.csv

Large diffs are not rendered by default.

124 changes: 124 additions & 0 deletions mhkit/loads/extreme.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pandas as pd
from scipy import stats
from scipy import optimize
from scipy import signal
from mhkit.wave.resource import frequency_moment


Expand Down Expand Up @@ -157,6 +158,129 @@ def peaks_distribution_weibull_tail_fit(x):
return peaks


def automatic_hs_threshold(
    peaks,
    sampling_rate,
    initial_threshold_range=(0.990, 0.995, 0.001),
    max_refinement=5
):
    """
    Find the best significant wave height threshold for the
    peaks-over-threshold method.

    This method was developed by:

    > Neary, V. S., S. Ahn, B. E. Seng, M. N. Allahdadi, T. Wang, Z. Yang and R. He (2020).
    > "Characterization of Extreme Wave Conditions for Wave Energy Converter Design and Project Risk Assessment."
    > J. Mar. Sci. Eng. 2020, 8(4), 289; https://doi.org/10.3390/jmse8040289.

    Please cite this paper if using this method.

    After all thresholds in the initial range are evaluated, the search
    range is refined around the optimal point until either (i) there
    is minimal change from the previous refinement results, (ii) the
    number of data points becomes smaller than about 1 per year, or (iii)
    the maximum number of iterations is reached.

    Parameters
    ----------
    peaks: np.array
        Peak values of the response time-series.
    sampling_rate: float
        Sampling rate in hours.
    initial_threshold_range: tuple
        Initial range of thresholds to search, as (min, max, step) in
        units of percentile/100 (e.g. 0.990 is the 99th percentile).
    max_refinement: int
        Maximum number of times to refine the search range.

    Returns
    -------
    best_threshold: float
        Threshold (as percentile/100) that results in the best correlation.
    best_threshold_unit: float
        Threshold in the same units as ``peaks``.
    """
    assert isinstance(sampling_rate, (float, int)), (
        'sampling_rate must be of type float or int')
    assert isinstance(peaks, np.ndarray), 'peaks must be of type np.ndarray'
    assert len(initial_threshold_range) == 3, (
        'initial_threshold_range must be length 3')
    assert isinstance(max_refinement, int), 'max_refinement must be an int'

    range_min, range_max, range_step = initial_threshold_range
    best_threshold = -1
    years = len(peaks)/(365.25*24/sampling_rate)

    def _peaks_over_threshold(peaks, threshold, sampling_rate):
        # Convert the percentile-fraction threshold to the peaks' units.
        threshold_unit = stats.scoreatpercentile(peaks, 100*threshold)
        idx_peaks = np.arange(len(peaks))
        # Storm peaks are global peaks of the threshold-shifted series,
        # so their values are excesses over the threshold.
        idx_storm_peaks, storm_peaks = global_peaks(
            idx_peaks, peaks-threshold_unit)
        idx_storm_peaks = idx_storm_peaks.astype(int)

        # Two storms that are close enough (within specified window) are
        # considered the same storm, to ensure independence.
        independent_storm_peaks = [storm_peaks[0], ]
        idx_independent_storm_peaks = [idx_storm_peaks[0], ]
        # check first 14 days to determine window size
        nlags = int(14 * 24 / sampling_rate)
        x = peaks - np.mean(peaks)
        acf = signal.correlate(x, x, mode="full")
        lag = signal.correlation_lags(len(x), len(x), mode="full")
        idx_zero = np.argmax(lag == 0)
        positive_lag = lag[(idx_zero):(idx_zero+nlags+1)]
        acf_positive = acf[(idx_zero):(idx_zero+nlags+1)] / acf[idx_zero]

        window_size = sampling_rate * positive_lag[acf_positive < 0.5][0]
        # window size in "observations" instead of "hours" between peaks.
        window = window_size / sampling_rate
        # keep only independent storm peaks
        for idx in idx_storm_peaks[1:]:
            if (idx - idx_independent_storm_peaks[-1]) > window:
                idx_independent_storm_peaks.append(idx)
                # FIX: store the excess (peak - threshold) so every element
                # is consistent with storm_peaks[0] above.
                independent_storm_peaks.append(peaks[idx] - threshold_unit)
            elif (peaks[idx] - threshold_unit) > independent_storm_peaks[-1]:
                # within the window: keep only the larger of the two peaks
                idx_independent_storm_peaks[-1] = idx
                independent_storm_peaks[-1] = peaks[idx] - threshold_unit

        return independent_storm_peaks

    for i in range(max_refinement):
        thresholds = np.arange(range_min, range_max, range_step)
        correlations = []

        for threshold in thresholds:
            distribution = stats.genpareto
            over_threshold = _peaks_over_threshold(
                peaks, threshold, sampling_rate)
            rate_per_year = len(over_threshold) / years
            if rate_per_year < 2:
                break
            distributions_parameters = distribution.fit(
                over_threshold, floc=0.)
            # FIX: measure goodness of fit on the excesses the GPD was
            # fit to (the original mistakenly passed all peaks).
            _, (_, _, correlation) = stats.probplot(
                over_threshold, distributions_parameters, distribution,
                fit=True)
            correlations.append(correlation)

        if not correlations:
            # every tested threshold gave < 2 peaks/year; np.argmax would
            # raise on the empty list, so stop refining here
            break
        max_i = np.argmax(correlations)
        minimal_change = np.abs(best_threshold - thresholds[max_i]) < 0.0005
        best_threshold = thresholds[max_i]
        if minimal_change and i < max_refinement-1:
            break
        range_step /= 10
        if max_i == len(thresholds)-1:
            range_min = thresholds[max_i - 1]
            range_max = thresholds[max_i] + 5*range_step
        elif max_i == 0:
            range_min = thresholds[max_i] - 9*range_step
            range_max = thresholds[max_i + 1]
        else:
            range_min = thresholds[max_i-1]
            range_max = thresholds[max_i+1]

    best_threshold_unit = stats.scoreatpercentile(peaks, 100*best_threshold)
    return best_threshold, best_threshold_unit


def peaks_distribution_peaks_over_threshold(x, threshold=None):
"""
Estimate the peaks distribution using the peaks over threshold
Expand Down
7 changes: 7 additions & 0 deletions mhkit/tests/loads/test_loads.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,13 @@ def test_shortterm_extreme(self):
ste = loads.extreme.ste(t, data, t_st, method)
assert_allclose(ste.cdf(x), cdf_1)

def test_automatic_threshold(self):
    # Regression test for loads.extreme.automatic_hs_threshold on the
    # example significant-wave-height dataset shipped with the repo.
    filename = "data_loads_hs.csv"
    data = np.loadtxt(os.path.join(datadir, filename), delimiter=",")
    # NOTE(review): this value is passed as the function's sampling_rate
    # argument (in hours), but the local name 'years' suggests a span of
    # years -- confirm the intended meaning against the function signature.
    years = 2.97
    pct, threshold = loads.extreme.automatic_hs_threshold(data, years)
    # Expected values pinned from a prior run; the TODOs mark them as
    # provisional reference values.
    assert np.isclose(pct, 0.9894099999999999)  # TODO
    assert np.isclose(threshold, 1.027523048)  # TODO

if __name__ == '__main__':
unittest.main()
Loading