Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automatic threshold for peaks-over-threshold #268

Merged
merged 16 commits into from
Oct 30, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 112 additions & 0 deletions MATLAB/findPOT.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
function [ excess,peak_ind,lambda,avg_sz,tau ] = findPOT( Hs,srate,windsz,thresh )
%findPOT finds independent peaks over a specified threshold value for UNCENSORED Hs data.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%When using all time records in a year:
%   Input Hs should be an nx1 vector of time series data of significant wave
%   heights.
%When using a subset of time records in a year (i.e. seasonal data, using
%Oct to Apr only, etc.):
%   Input Hs should be an nx1 or 1xn cell array of time series data of significant wave
%   heights, where each cell is a vector of time series data from a single season/a single
%   specified time period. For example, if computing Hs return periods from a
%   hindcast ranging from 2000 to 2010 using Oct through Apr data, Hs{1} is
%   the time series from Oct 2000 to Apr 2001, Hs{2} is the time series
%   from Oct 2001 to Apr 2002, etc.
%
%Other Input:
%   srate  - The sampling rate, in hours, of the data
%   windsz - The window size, in hours, to be used to ensure independent
%            peaks
%   thresh - The threshold, in meters, to be used for determining peaks
%
%Output:
%   excess   - excesses of the independent peaks over the threshold
%   peak_ind - index of each independent peak within its season's series
%   lambda   - average number of independent peaks per year/season
%   avg_sz   - mean cluster size (NaN if no season had exceedances; when
%              cell input is used this reflects the last season processed)
%   tau      - accumulated mean storm spacing (in observations) divided by
%              ny. NOTE(review): for non-cell input this divides a single
%              season's mean by the number of years - confirm intended.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%if full set is given, convert to cell array and compute # years in series
if(~iscell(Hs))
    ny = size(Hs,1)*srate/(24*365.25); %number of years in time series
    Hs = {Hs};
    n = 1;
else
    %if cell array is given, the number of years/seasons represented by
    %series is number of cells in array.
    ny = numel(Hs);
    n = ny;
end

%convert window size from 'hours between peaks' to 'observations between peaks'
window = windsz/srate;
excess = [];
peak_ind = [];
tau = 0;
avg_sz = NaN; %remains NaN if no season has any exceedances
%loop through each season (only one when full dataset is used, jan - dec)
for season = 1:n
    %convert to vector for easier use
    Hs_curr = Hs{season};

    %find all points above the threshold
    ind = find(Hs_curr > thresh);

    %avg time between storms. FIX: gaps in 'temp' are measured in
    %observations, so compare against window/2 (observations), not
    %windsz/2 (hours). Also guard against mean([]) = NaN.
    temp = diff(ind);
    gaps = temp(temp > window/2);
    if(~isempty(gaps))
        tau = tau + mean(gaps);
    end

    nex = numel(ind); %FIX: size(ind,1) is 1 for an empty 1x0 row vector
    if(nex > 0)
        %identify start and end points of all groups of consecutive points
        %above threshold. FIX: check every consecutive pair starting at the
        %first one (the original loop started at 2 and missed the gap
        %between the first and second exceedance).
        stpt = ind(1);
        endpt = [];
        for j = 1:nex - 1
            if(ind(j + 1) - ind(j) > 1)
                endpt = [endpt;ind(j)];
                stpt = [stpt;ind(j + 1)];
            end
        end
        endpt = [endpt;ind(end)];
        avg_sz = mean(endpt - stpt);
        %set number clusters
        nclust = size(endpt,1);

        %find maxima for each cluster
        peaks = NaN(nclust,2);
        for c = 1:nclust
            clustrang = stpt(c):endpt(c);
            [peaks(c,1),tempi] = max(Hs_curr(clustrang));
            peaks(c,2) = clustrang(tempi);
        end

        %for independence, check that new peak is more than window from old
        %peak. if time between is less than window, use only larger of two.
        peak_dist = diff(peaks(:,2));
        if(any(peak_dist < window))
            newp = peaks(1,:);
            for c = 2:nclust
                if(peak_dist(c-1) > window)
                    newp = [newp;peaks(c,:)];
                else
                    if(newp(end,1) > peaks(c,1))
                        %kept peak is larger; re-measure next distance from
                        %the kept peak instead of the dropped one
                        if(c < nclust)
                            peak_dist(c) = peaks(c+1,2) - newp(end,2);
                        end
                    else
                        newp(end,:) = peaks(c,:);
                    end
                end
            end

            peaks = newp;
        end

        %compute excesses and index of peaks to return
        excess = [excess;peaks(:,1) - thresh];
        peak_ind = [peak_ind;peaks(:,2)];
    end
end

%compute average number of clusters per year/season/time frame
lambda = numel(excess)/ny;
tau = tau/ny;
end

105 changes: 105 additions & 0 deletions MATLAB/threshold.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
function [pct, best_thresh, flags, POT_info] = threshold(Hs,samp_rate)
%threshold finds the best significant wave height threshold for the
%peaks-over-threshold method by maximizing the correlation of a QQ plot of
%the peak excesses against a fitted Generalized Pareto distribution.
%
%Input:
%   Hs        - nx1 vector of significant wave height time series data
%   samp_rate - The sampling rate, in hours, of the data
%
%Output:
%   pct         - percentile (in percent, e.g. 99.2) of the chosen threshold
%   best_thresh - threshold value (same units as Hs) with best correlation
%   flags       - struct of warning flags raised during the search
%   POT_info    - struct of auxiliary info (window size used)
%
%On failure (acf never drops below 0.5 within 14 days), pct and
%best_thresh are returned as NaN so the caller can detect the condition.

%use the autocorrelation over the first 14 days to pick the window that
%ensures peak independence
nlags = floor(14*24/samp_rate);
[acf,lag] = xcorr(Hs - mean(Hs), nlags, 'coeff');
positive_lag = lag((nlags+1):end);
positive_acf = acf((nlags+1):end); %FIX: semicolon added (debug echo removed)
below_thresh = find(positive_acf < 0.5);

%if it doesn't drop below 0.5 in first 14 days (nlags) then the user should
%double check their input OR should be fitting this manually to choose the
%window.
if isempty(below_thresh)
    fprintf(strcat('ERROR: The acf does not drop below 0.5 in first 14 days, check inputs', ...
        ' and retry function, or fit manually.\n'))
    %FIX: assign the declared outputs before returning (the original
    %assigned undeclared variables, leaving pct/best_thresh undefined)
    pct = NaN;
    best_thresh = NaN;
    flags.acf_above_half = true;
    POT_info = struct();
    return
end

%set window size (in hours) to the time where acf dropped below 0.5
windsize = samp_rate*positive_lag(min(below_thresh));

%set flag if the window is wider than 4 days
if windsize > 24*4 - 1
    fprintf('WARNING: The acf window is over 4 days. Check final fit.\n')
    flags.large_window_size = true;
else
    flags.large_window_size = false;
end

%save the window size in info structure
POT_info.window_size = windsize;

distribution = 'GeneralizedPareto';

%initialize the first set of thresholds to be tested as the percentiles
%ranging from 99 to 99.5 percentile
pct_step = 0.1;
thresh_pct = 99:pct_step:99.5;
thresh_test = prctile(Hs,thresh_pct);
keep_going = true;
current_best_thresh = -100; %sentinel so the first pass never converges
flags.test_lambda_below_1 = false;

%loop through until the maximum R is found or the number of peaks per
%year chosen drops to ~1 per year (annual maxima method)
while(keep_going)
    thresh_corr = NaN(size(thresh_test));
    for i = 1:length(thresh_test)
        %find independent peaks over the test threshold
        [excess,~,lambda,~] = findPOT(Hs,samp_rate,windsize,thresh_test(i));

        %fit distribution to the excesses
        POTdist = fitdist(excess,distribution);

        %correlation of the QQ plot measures goodness of fit
        QQ_plt = qqplot(excess,POTdist);
        X_data = get(QQ_plt,'Xdata');
        Y_data = get(QQ_plt,'Ydata');
        thresh_corr(i) = corr(X_data{1}',Y_data{1}');

        %if lambda falls below one, the sample set is now less than one
        %peak per year and should not be used, (might as well use AM)
        if(lambda < 1)
            thresh_corr(i) = 0;
            flags.test_lambda_below_1 = true;
        end

    end
    %find threshold with maximum correlation
    [~,max_i] = max(thresh_corr);

    %if the threshold hasn't changed more than 0.05m OR the number of
    %samples per year has dropped below 2 (~one per year), then quit
    if(abs(current_best_thresh - thresh_test(max_i)) < 0.05 || lambda < 2)
        best_thresh = thresh_test(max_i);
        pct = thresh_pct(max_i);
        keep_going = false;
        flags.loop_stopped_by_lambda = (lambda < 2);
    %otherwise, find a finer range around the new maximum and loop again
    else
        current_best_thresh = thresh_test(max_i);
        pct_step = pct_step/10;
        if(max_i == length(thresh_test))
            thresh_pct = thresh_pct(max_i-1):pct_step:thresh_pct(max_i)+5*pct_step;
        elseif(max_i == 1)
            thresh_pct = thresh_pct(max_i)-9*pct_step:pct_step:thresh_pct(max_i+1);
        else
            thresh_pct = thresh_pct(max_i-1):pct_step:thresh_pct(max_i+1);
        end
        thresh_test = prctile(Hs,thresh_pct);
    end
end
end
1 change: 1 addition & 0 deletions examples/data/loads/data_loads_hs.csv

Large diffs are not rendered by default.

124 changes: 124 additions & 0 deletions mhkit/loads/extreme.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pandas as pd
from scipy import stats
from scipy import optimize
from scipy import signal
from mhkit.wave.resource import frequency_moment


Expand Down Expand Up @@ -157,6 +158,129 @@ def peaks_distribution_weibull_tail_fit(x):
return peaks


def automatic_hs_threshold(
    peaks,
    sampling_rate,
    initial_threshold_range=(0.990, 0.995, 0.001),
    max_refinement=5
):
    """
    Find the best significant wave height threshold for the
    peaks-over-threshold method.

    This method was developed by:

    > Neary, V. S., S. Ahn, B. E. Seng, M. N. Allahdadi, T. Wang, Z. Yang and R. He (2020).
    > "Characterization of Extreme Wave Conditions for Wave Energy Converter Design and Project Risk Assessment."
    > J. Mar. Sci. Eng. 2020, 8(4), 289; https://doi.org/10.3390/jmse8040289.

    Please cite this paper if using this method.

    After all thresholds in the initial range are evaluated, the search
    range is refined around the optimal point until either (i) there
    is minimal change from the previous refinement results, (ii) the
    number of data points becomes smaller than about 1 per year, or (iii)
    the maximum number of iterations is reached.

    Parameters
    ----------
    peaks: np.array
        Peak values of the response time-series.
    sampling_rate: float
        Sampling rate in hours.
    initial_threshold_range: tuple
        Initial range of thresholds to search, as (min, max, step) in
        units of percentile/100 (e.g. 0.990 is the 99th percentile).
    max_refinement: int
        Maximum number of times to refine the search range.

    Returns
    -------
    best_threshold: float
        Threshold (as percentile/100) that results in the best correlation.
    best_threshold_unit: float
        Threshold in the same units as ``peaks``.
    """
    assert isinstance(sampling_rate, (float, int)), (
        'sampling_rate must be of type float or int')
    assert isinstance(peaks, np.ndarray), 'peaks must be of type np.ndarray'
    assert len(initial_threshold_range) == 3, (
        'initial_threshold_range must be length 3')
    assert isinstance(max_refinement, int), 'max_refinement must be an int'

    range_min, range_max, range_step = initial_threshold_range
    best_threshold = -1
    years = len(peaks)/(365.25*24/sampling_rate)

    def _peaks_over_threshold(peaks, threshold, sampling_rate):
        # Convert the percentile-fraction threshold to the peaks' units.
        threshold_unit = stats.scoreatpercentile(peaks, 100*threshold)
        idx_peaks = np.arange(len(peaks))
        # Storm peaks are global peaks of the threshold-shifted series,
        # so their values are excesses over the threshold.
        idx_storm_peaks, storm_peaks = global_peaks(
            idx_peaks, peaks-threshold_unit)
        idx_storm_peaks = idx_storm_peaks.astype(int)

        # Two storms that are close enough (within specified window) are
        # considered the same storm, to ensure independence.
        independent_storm_peaks = [storm_peaks[0], ]
        idx_independent_storm_peaks = [idx_storm_peaks[0], ]
        # check first 14 days to determine window size
        nlags = int(14 * 24 / sampling_rate)
        x = peaks - np.mean(peaks)
        acf = signal.correlate(x, x, mode="full")
        lag = signal.correlation_lags(len(x), len(x), mode="full")
        idx_zero = np.argmax(lag == 0)
        positive_lag = lag[(idx_zero):(idx_zero+nlags+1)]
        acf_positive = acf[(idx_zero):(idx_zero+nlags+1)] / acf[idx_zero]

        window_size = sampling_rate * positive_lag[acf_positive < 0.5][0]
        # window size in "observations" instead of "hours" between peaks.
        window = window_size / sampling_rate
        # keep only independent storm peaks
        for idx in idx_storm_peaks[1:]:
            if (idx - idx_independent_storm_peaks[-1]) > window:
                idx_independent_storm_peaks.append(idx)
                # FIX: store the excess (peak - threshold) so every element
                # is consistent with storm_peaks[0] above.
                independent_storm_peaks.append(peaks[idx] - threshold_unit)
            elif (peaks[idx] - threshold_unit) > independent_storm_peaks[-1]:
                # within the window: keep only the larger of the two peaks
                idx_independent_storm_peaks[-1] = idx
                independent_storm_peaks[-1] = peaks[idx] - threshold_unit

        return independent_storm_peaks

    for i in range(max_refinement):
        thresholds = np.arange(range_min, range_max, range_step)
        correlations = []

        for threshold in thresholds:
            distribution = stats.genpareto
            over_threshold = _peaks_over_threshold(
                peaks, threshold, sampling_rate)
            rate_per_year = len(over_threshold) / years
            if rate_per_year < 2:
                break
            distributions_parameters = distribution.fit(
                over_threshold, floc=0.)
            # FIX: measure goodness of fit on the excesses the GPD was
            # fit to (the original mistakenly passed all peaks).
            _, (_, _, correlation) = stats.probplot(
                over_threshold, distributions_parameters, distribution,
                fit=True)
            correlations.append(correlation)

        if not correlations:
            # every tested threshold gave < 2 peaks/year; np.argmax would
            # raise on the empty list, so stop refining here
            break
        max_i = np.argmax(correlations)
        minimal_change = np.abs(best_threshold - thresholds[max_i]) < 0.0005
        best_threshold = thresholds[max_i]
        if minimal_change and i < max_refinement-1:
            break
        range_step /= 10
        if max_i == len(thresholds)-1:
            range_min = thresholds[max_i - 1]
            range_max = thresholds[max_i] + 5*range_step
        elif max_i == 0:
            range_min = thresholds[max_i] - 9*range_step
            range_max = thresholds[max_i + 1]
        else:
            range_min = thresholds[max_i-1]
            range_max = thresholds[max_i+1]

    best_threshold_unit = stats.scoreatpercentile(peaks, 100*best_threshold)
    return best_threshold, best_threshold_unit


def peaks_distribution_peaks_over_threshold(x, threshold=None):
"""
Estimate the peaks distribution using the peaks over threshold
Expand Down
7 changes: 7 additions & 0 deletions mhkit/tests/loads/test_loads.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,13 @@ def test_shortterm_extreme(self):
ste = loads.extreme.ste(t, data, t_st, method)
assert_allclose(ste.cdf(x), cdf_1)

def test_automatic_threshold(self):
    # Regression test for loads.extreme.automatic_hs_threshold on the
    # example significant-wave-height dataset shipped with the repo.
    filename = "data_loads_hs.csv"
    data = np.loadtxt(os.path.join(datadir, filename), delimiter=",")
    # NOTE(review): this value is passed as the function's sampling_rate
    # argument (in hours), but the local name 'years' suggests a span of
    # years -- confirm the intended meaning against the function signature.
    years = 2.97
    pct, threshold = loads.extreme.automatic_hs_threshold(data, years)
    # Expected values pinned from a prior run; the TODOs mark them as
    # provisional reference values.
    assert np.isclose(pct, 0.9894099999999999)  # TODO
    assert np.isclose(threshold, 1.027523048)  # TODO

if __name__ == '__main__':
unittest.main()
Loading