assimilation_code/modules/assimilation/normal_distribution_mod.f90

! DART software - Copyright UCAR. This open source software is provided
! by UCAR, "as is", without charge, subject to all terms of use at
! http://www.image.ucar.edu/DAReS/DART/DART_download

module normal_distribution_mod

use types_mod, only : r8, missing_r8, digits12, PI

use utilities_mod, only : E_ERR, E_MSG, error_handler

use distribution_params_mod, only : distribution_params_type, NORMAL_DISTRIBUTION

implicit none
private

public :: normal_cdf, inv_std_normal_cdf, inv_weighted_normal_cdf, test_normal, &
          normal_mean_variance, normal_mean_sd, inv_cdf, set_normal_params_from_ens

character(len=512)        :: errstring
character(len=*), parameter :: source = 'normal_distribution_mod.f90'

! These quantiles bracket the range over which inv_std_normal_cdf functions
! The test routines are confined to this range and values outside this are
! changed to these. Approximate correpsonding standard deviations are in 
! min_sd and max_sd and these are the range over which the test_normal functions.
! The max_sd is smaller in magnitude than the min_sd because the Fortran number
! model cannot represent numbers as close to 1 as it can to 0.
real(r8), parameter :: min_quantile = 0.0_r8,  max_quantile = 0.999999999999999_r8
real(r8), parameter :: min_sd = -30.0_r8, max_sd = 8.0_r8

contains 

!------------------------------------------------------------------------

subroutine test_normal

! This routine provides limited tests of the numerics in this module. It begins
! by comparing a handful of cases of the cdf to results from Matlab. It
! then tests the quality of the inverse cdf for a single mean/sd. Failing
! these tests suggests a serious problem. Passing them does not indicate that 
! there are acceptable results for all possible inputs. 

! Set number of equally spaced trials for the test F-1(F(x)) where F is the CDF.
integer, parameter :: num_trials = 10000000

integer  :: i, j
real(r8) :: sd, quantile, inv, max_diff(16), max_q(16), max_matlab_diff

! Comparative results for a handful of cases from MATLAB21a
real(r8) :: cdf_diff(7)
real(r8) :: mmean(7) = [0.0_r8, 1.0_r8, -1.0_r8, 0.0_r8, 0.0_r8, 0.0_r8, 0.5_r8]
real(r8) :: msd(7) = [0.5_r8, 1.0_r8, 2.0_r8, 4.0_r8, 5.0_r8, 6.0_r8, 0.25_r8]
real(r8) :: mx(7)     = [0.1_r8, 0.2_r8, 0.3_r8, 0.4_r8, 0.5_r8, 0.6_r8, 0.7_r8]
! Generated by matlab normcdf(mx, mmean, msd)
real(r8) :: mcdf(7) = [0.579259709439103_r8, 0.211855398583397_r8, 0.742153889194135_r8, &
                       0.539827837277029_r8, 0.539827837277029_r8, 0.539827837277029_r8, &
                       0.788144601416603_r8]
! Bounds for quantile inversion differences
real(r8) :: inv_diff_bound(16) = [1e-10_r8, 1e-10_r8, 1e-10_r8, 1e-10_r8, 1e-10_r8, &
                                  1e-9_r8, 1e-8_r8, 1e-7_r8, 1e-7_r8, 1e-6_r8, &
                                  1e-5_r8, 1e-4_r8, 1e-3_r8, 1e-2_r8, 1e-1_r8, 1e-0_r8]

! Compare to matlab 
! Absolute value of differences should be less than 1e-15
do i = 1, 7
   cdf_diff(i) = normal_cdf(mx(i), mmean(i), msd(i)) - mcdf(i)
end do
max_matlab_diff = maxval(abs(cdf_diff))
if(max_matlab_diff < 1.0e-15_r8) then
   write(*, *) 'Matlab Comparison Tests: PASS ', max_matlab_diff
else
   write(*, *) 'Matlab Comparison Tests: FAIL ', max_matlab_diff
endif

! Keep track of differences as function of quantile
max_diff = 0.0_r8
do j = 1, 16
   max_q(j) = 1.0_r8 - 0.1**j
enddo

! Test the inversion of the cdf over +/- 30 standard deviations around mean
do i = 1, num_trials + 1
   sd = min_sd + (i - 1.0_r8) * (max_sd - min_sd) / num_trials 
   quantile = normal_cdf(sd, 0.0_r8, 1.0_r8)
   inv = inv_std_normal_cdf(quantile)
   do j = 1, 16
      if(quantile < max_q(j)) then
         max_diff(j) = max(abs(sd-inv), max_diff(j))
      endif
   enddo
end do

do j = 1, 16
   if(max_diff(j) > inv_diff_bound(j)) then
     write(*, *) 'FAIL: Max inversion diff ', max_diff(j), ' > bound ', inv_diff_bound(j), &
        'for quantiles < ', max_q(j)
   else
     write(*, *) 'PASS: Max inversion diff ', max_diff(j), ' < bound ', inv_diff_bound(j), &
        'for quantiles < ', max_q(j)
   endif
end do

end subroutine test_normal

!------------------------------------------------------------------------

function normal_cdf_params(x, p)

real(r8)                                   :: normal_cdf_params
real(r8), intent(in)                       :: x
type(distribution_params_type), intent(in) :: p

! A translation routine that is required to use the generic cdf optimization routine
! Extracts the appropriate information from the distribution_params_type that is needed
! for a call to the function normal_cdf below. 

real(r8) :: mean, sd

mean = p%params(1);    sd = p%params(2)
normal_cdf_params = normal_cdf(x, mean, sd)

end function normal_cdf_params

!------------------------------------------------------------------------

function normal_cdf(x_in, mean, sd)

! Approximate cumulative distribution function for normal

real(r8)             :: normal_cdf
real(r8), intent(in) :: x_in
real(r8), intent(in) :: mean, sd

real(digits12) :: nx

! Convert to a standard normal
nx = (x_in - mean) / sd

if(nx < 0.0_digits12) then
   normal_cdf = 0.5_digits12 * erfc(-nx / sqrt(2.0_digits12))
else
   normal_cdf = 0.5_digits12 * (1.0_digits12 + erf(nx / sqrt(2.0_digits12)))
endif

end function normal_cdf

!------------------------------------------------------------------------

function inv_weighted_normal_cdf(alpha, mean, sd, q) result(x)

! Find the value of x for which the cdf of a N(mean, sd) multiplied times
! alpha has value q.

real(r8)              :: x
real(r8), intent(in)  :: alpha, mean, sd, q

real(r8) :: normalized_q

! VARIABLES THROUGHOUT NEED TO SWITCH TO DIGITS_12

! Can search in a standard normal, then multiply by sd at end and add mean
! Divide q by alpha to get the right place for weighted normal
normalized_q = q / alpha

! Find spot in standard normal
x = inv_std_normal_cdf(normalized_q)

! Add in the mean and normalize by sd
x = mean + x * sd

end function inv_weighted_normal_cdf


!------------------------------------------------------------------------

function approx_inv_normal_cdf_params(quantile, p)

real(r8)                                   :: approx_inv_normal_cdf_params
real(r8), intent(in)                       :: quantile
type(distribution_params_type), intent(in) :: p

! A translation routine that is required to use the generic first_guess for
! the cdf  optimization routine.
! Extracts the appropriate information from the distribution_params_type that is needed
! for a call to the function approx_inv_normal_cdf below (which is nothing).

approx_inv_normal_cdf_params = approx_inv_normal_cdf(quantile)

end function approx_inv_normal_cdf_params

!------------------------------------------------------------------------

function approx_inv_normal_cdf(quantile_in) result(x)

real(r8)             :: x
real(r8), intent(in) :: quantile_in

! This is used to get a good first guess for the search in inv_std_normal_cdf
! The params argument is not needed here but is required for consistency &
! with other distributions

! normal inverse
! translate from http://home.online.no/~pjacklam/notes/invnorm
! a routine written by john herrero

real(r8) :: quantile 
real(r8) :: quantile_low,quantile_high
real(r8) :: a1,a2,a3,a4,a5,a6
real(r8) :: b1,b2,b3,b4,b5
real(r8) :: c1,c2,c3,c4,c5,c6
real(r8) :: d1,d2,d3,d4
real(r8) :: r, s

! Truncate out of range quantiles, converts them to smallest positive number or largest number <1
! This solution is stable, but may lead to underflows being thrown. May want to 
! think of a better solution. 
quantile = quantile_in
if(quantile <= 0.0_r8) quantile = tiny(quantile_in)
if(quantile >= 1.0_r8) quantile = nearest(1.0_r8, -1.0_r8)

a1 = -39.69683028665376_digits12
a2 =  220.9460984245205_digits12
a3 = -275.9285104469687_digits12
a4 =  138.357751867269_digits12
a5 = -30.66479806614716_digits12
a6 =  2.506628277459239_digits12
b1 = -54.4760987982241_digits12
b2 =  161.5858368580409_digits12
b3 = -155.6989798598866_digits12
b4 =  66.80131188771972_digits12
b5 = -13.28068155288572_digits12
c1 = -0.007784894002430293_digits12
c2 = -0.3223964580411365_digits12
c3 = -2.400758277161838_digits12
c4 = -2.549732539343734_digits12
c5 =  4.374664141464968_digits12
c6 =  2.938163982698783_digits12
d1 =  0.007784695709041462_digits12
d2 =  0.3224671290700398_digits12
d3 =  2.445134137142996_digits12
d4 =  3.754408661907416_digits12
quantile_low  = 0.02425_digits12
quantile_high = 1_digits12 - quantile_low
! Split into an inner and two outer regions which have separate fits
if(quantile < quantile_low) then
   s = sqrt(-2.0_digits12 * log(quantile))
   x = (((((c1*s + c2)*s + c3)*s + c4)*s + c5)*s + c6) / &
      ((((d1*s + d2)*s + d3)*s + d4)*s + 1.0_digits12)
else if(quantile > quantile_high) then
   s = sqrt(-2.0_digits12 * log(1.0_digits12 - quantile))
   x = -(((((c1*s + c2)*s + c3)*s + c4)*s + c5)*s + c6) / &
      ((((d1*s + d2)*s + d3)*s + d4)*s + 1.0_digits12)
else
   s = quantile - 0.5_digits12
   r = s*s
   x = (((((a1*r + a2)*r + a3)*r + a4)*r + a5)*r + a6)*s / &
      (((((b1*r + b2)*r + b3)*r + b4)*r + b5)*r + 1.0_digits12)
endif

end function approx_inv_normal_cdf

!------------------------------------------------------------------------

function inv_std_normal_cdf_params(quantile, p) result(x)

real(r8)                                   :: x
real(r8), intent(in)                       :: quantile
type(distribution_params_type), intent(in) :: p

x = inv_cdf(quantile, normal_cdf_params, approx_inv_normal_cdf_params, p)


end function inv_std_normal_cdf_params

!------------------------------------------------------------------------

function inv_std_normal_cdf(quantile) result(x)

real(r8)              :: x
real(r8), intent(in)  :: quantile

! This naive Newton method is much more accurate than approx_inv_normal_cdf, especially
! for quantile values less than 0.5. 

! Given a quantile q, finds the value of x for which the standard normal cdf
! has approximately this quantile

! Where should the stupid p type come from
type(distribution_params_type) :: p
real(r8) :: mean, sd

! Set the mean and sd to 0 and 1 for standard normal
mean        = 0.0_r8;     sd          = 1.0_r8
p%params(1) = mean;       p%params(2) = sd

x = inv_std_normal_cdf_params(quantile, p)

end function inv_std_normal_cdf

!------------------------------------------------------------------------

function inv_cdf(quantile_in, cdf, first_guess, p) result(x)

interface
   function cdf(x, p)
      use types_mod, only : r8
      use distribution_params_mod, only : distribution_params_type
      real(r8)                                   :: cdf
      real(r8), intent(in)                       :: x
      type(distribution_params_type), intent(in) :: p
   end function
end interface

interface
   function first_guess(quantile, p)
      use types_mod, only : r8
      use distribution_params_mod, only : distribution_params_type
      real(r8)                                   :: first_guess
      real(r8), intent(in)                       :: quantile
      type(distribution_params_type), intent(in) :: p
   end function
end interface

real(r8)                                   :: x
real(r8), intent(in)                       :: quantile_in
type(distribution_params_type), intent(in) :: p

! This naive Newton method is much more accurate than approx_inv_normal_cdf, especially
! for quantile values less than 0.5. 

! Given a quantile q, finds the value of x for which the standard normal cdf
! has approximately this quantile

! Limit on the total iterations; Increasing this does not change any of the results
! that do not converge for the test_normal call on gfortran.
integer, parameter :: max_iterations = 50

! Limit on number of times to halve the increment; No deep thought.
integer, parameter :: max_half_iterations = 25

real(r8) :: quantile
real(r8) :: reltol, dq_dx, delta
real(r8) :: x_guess, q_guess, x_new, q_new, del_x, del_q, del_q_old, q_old
integer  :: iter, j

real(r8) :: lower_bound,   upper_bound
logical  :: bounded_below, bounded_above

! Extract the required information from the p type
bounded_below = p%bounded_below;   bounded_above = p%bounded_above
lower_bound   = p%lower_bound;     upper_bound   = p%upper_bound

quantile = quantile_in

! Do a test for illegal values on the quantile
if(quantile <  0.0_r8 .or. quantile > 1.0_r8) then
   ! Need an error message
   write(errstring, *) 'Illegal Quantile input', quantile
   call error_handler(E_ERR, 'inv_cdf', errstring, source)
endif

! If the distribution is bounded, quantiles at the limits have values at the bounds
if(bounded_below .and. quantile == 0.0_r8) then
   x = lower_bound
   return
endif
if(bounded_above .and. quantile == 1.0_r8) then
   x = upper_bound
   return
endif

! If input quantiles are outside the numerically supported range, move them to the extremes
quantile = min(quantile, max_quantile)
! code tests stably for many distributions with min_quantile of 0.0, could remove this
quantile = max(quantile, min_quantile)

! Get first guess from functional approximation
x_guess = first_guess(quantile, p)

! Evaluate the cdf
q_guess = cdf(x_guess, p)

del_q = q_guess - quantile

! Iterations of the Newton method to approximate the root
do iter = 1, max_iterations
   ! Analytically, the PDF is derivative of CDF but this can be numerically inaccurate for extreme values
   ! Use numerical derivatives of the CDF to get more accurate inversion
   ! These values for the delta for the approximation work with Gfortran
   delta = max(1e-8_r8, 1e-8_r8 * abs(x_guess))
   dq_dx = (cdf(x_guess + delta, p) - cdf(x_guess - delta, p)) / (2.0_r8 * delta)
   ! Derivative of 0 means we're not going anywhere else
   if(dq_dx <= 0.0_r8) then
      x = x_guess
      return
   endif
   
   ! Linear approximation for how far to move in x
   del_x = del_q / dq_dx
   x_new = x_guess - del_x

   ! Look for convergence; If the change in x is smaller than approximate precision 
   reltol = (epsilon(x_guess))**(0.75_r8)
   if(abs(del_x) <= reltol) then
      x = x_new
      return
   endif
    
   ! If we've gone too far, the new error will be bigger than the old; 
   ! Repeatedly half the distance until this is rectified 
   del_q_old = del_q
   q_new = cdf(x_new, p)
   do j = 1, max_half_iterations
      del_q = q_new - quantile
      if (abs(del_q) < abs(del_q_old)) then
         exit
      endif
      q_old = q_new
      x_new = (x_guess + x_new)/2.0_r8
      q_new = cdf(x_new, p)
      ! If q isn't changing, no point in continuing
      if(q_old == q_new) exit

   end do

   x_guess = x_new
end do

! For now, have switched a failed convergence to return the latest guess
! This has implications for stability of probit algorithms that require further study
! Not currently happening for any of the test cases on gfortran
x = x_new
write(errstring, *)  'Failed to converge for quantile ', quantile
call error_handler(E_MSG, 'inv_cdf', errstring, source)
!!!call error_handler(E_ERR, 'inv_cdf', errstring, source)

end function inv_cdf

!------------------------------------------------------------------------

function std_normal_pdf(x)

! Pdf of standard normal evaluated at x
real(r8) :: std_normal_pdf
real(r8), intent(in) :: x

std_normal_pdf = exp(-0.5_r8 * x**2) / (sqrt(2.0_r8 * PI))

end function std_normal_pdf

!------------------------------------------------------------------------

subroutine normal_mean_variance(x, num, mean, variance)

integer,  intent(in)  :: num
real(r8), intent(in)  :: x(num)
real(r8), intent(out) :: mean
real(r8), intent(out) :: variance

mean = sum(x) / num
variance  = sum((x - mean)**2) / (num - 1)

end subroutine normal_mean_variance

!------------------------------------------------------------------------

subroutine normal_mean_sd(x, num, mean, sd)

integer,  intent(in)  :: num
real(r8), intent(in)  :: x(num)
real(r8), intent(out) :: mean
real(r8), intent(out) :: sd

mean = sum(x) / num
sd  = sqrt(sum((x - mean)**2) / (num - 1))

end subroutine normal_mean_sd

!------------------------------------------------------------------------

subroutine set_normal_params_from_ens(ens, num, p)

integer,                        intent(in)                         :: num
real(r8),                       intent(in)                        :: ens(num)
type(distribution_params_type), intent(inout) :: p

! Set up the description of the normal distribution defined by the ensemble
p%distribution_type = NORMAL_DISTRIBUTION

! The two meaningful params are the mean and standard deviation
call normal_mean_sd(ens, num, p%params(1), p%params(2))


end subroutine set_normal_params_from_ens

!------------------------------------------------------------------------

end module normal_distribution_mod