From 8d3427fa634b082227b7379bd833bfa4d79d3a22 Mon Sep 17 00:00:00 2001 From: bvenn Date: Mon, 14 Nov 2022 21:02:57 +0100 Subject: [PATCH 01/10] add erfinv #238 --- src/FSharp.Stats/SpecialFunctions/Erf.fs | 35 +++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/src/FSharp.Stats/SpecialFunctions/Erf.fs b/src/FSharp.Stats/SpecialFunctions/Erf.fs index d2f6fdb5..9b4d0021 100644 --- a/src/FSharp.Stats/SpecialFunctions/Erf.fs +++ b/src/FSharp.Stats/SpecialFunctions/Erf.fs @@ -73,4 +73,37 @@ module Errorfunction = match x with | x when (infinity.Equals(x)) -> nan | x when ((-infinity).Equals(x)) -> infinity - | _ -> _erfcx x \ No newline at end of file + | _ -> _erfcx x + + //helper arrays for inverse error function + let private a = [|0.886226899; -1.645349621; 0.914624893; -0.140543331|] + let private b = [|1.; -2.118377725; 1.442710462; -0.329097515; 0.012229801|] + let private c = [|-1.970840454; -1.62490649; 3.429567803; 1.641345311|] + let private d = [|1.; 3.543889200; 1.637067800|] + + // inverse of error function. uses newton refinement; from https://libit.sourceforge.net/ + let inverf x = + let sqrtPi = sqrt Math.PI + + let z = abs x + + let r = + if z <= 0.7 then + let x2 = x * x + let r = z * (((a.[3] * x2 + a.[2]) * x2 + a.[1]) * x2 + a.[0]) + r / ((((b.[4] * x2 + b.[3]) * x2 + b.[2]) * x2 + b.[1]) * x2 + b.[0]) + else + let y = sqrt( -log((1. - z)/2.)) + let r = (((c.[3] * y + c.[2]) * y + c.[1]) * y + c.[0]) + r / ((d.[2] * y + d.[1]) * y + d.[0]) + + + let r' = r * float (sign x) + let z' = z * float (sign x) + + let r'' = r' - (Erf(r') - z')/(2./sqrtPi * exp(-r' * r')) + r'' - (Erf(r'') - z')/(2./sqrtPi *exp(-r'' * r'')) + + // inverse of complementary error function + let inverfc x = + inverf (1. - x) From 8145747dd366e4233cdc11bc81d50d950f6bfe36 Mon Sep 17 00:00:00 2001 From: bvenn Date: Mon, 14 Nov 2022 21:03:09 +0100 Subject: [PATCH 02/10] add qq plot --- src/FSharp.Stats/FSharp.Stats.fsproj | 1 + src/FSharp.Stats/Signal/QQPlot.fs | 30 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 src/FSharp.Stats/Signal/QQPlot.fs diff --git a/src/FSharp.Stats/FSharp.Stats.fsproj b/src/FSharp.Stats/FSharp.Stats.fsproj index ee417b24..8f1f3744 100644 --- a/src/FSharp.Stats/FSharp.Stats.fsproj +++ b/src/FSharp.Stats/FSharp.Stats.fsproj @@ -89,6 +89,7 @@ + diff --git a/src/FSharp.Stats/Signal/QQPlot.fs b/src/FSharp.Stats/Signal/QQPlot.fs new file mode 100644 index 00000000..9bf0c54c --- /dev/null +++ b/src/FSharp.Stats/Signal/QQPlot.fs @@ -0,0 +1,30 @@ +namespace FSharp.Stats.Signal + + +open FSharp.Stats +open System +open FSharp.Stats.SpecialFunctions +open FSharp.Stats.Quantile + +module QQPlot = + + /// computes the quantile quantile coordinates of two sample distributions. Uses default quantile (Quantile.mode) + let fromTwoSamples sampleA sampleB = + [0. .. 0.01 .. 1.] + |> List.map (fun quantile -> + mode quantile sampleA, + mode quantile sampleB + ) + /// computes the quantile quantile coordinates of a sample distributions against a normal distribution. Uses default quantile (Quantile.mode) + let fromSampleToGauss sample = + let standardizedData = + Signal.Normalization.zScoreTransformPopulation (vector sample) + + let inverseCDF x = + sqrt 2. * Errorfunction.inverf (2. * x - 1.) + + [0. .. 0.01 .. 1.] + |> List.map (fun quantile -> + inverseCDF quantile, + Quantile.mode quantile standardizedData + ) From f4adbdf1c99d7aa8772c3f35e89720ed1ba04dff Mon Sep 17 00:00:00 2001 From: bvenn Date: Mon, 14 Nov 2022 21:03:23 +0100 Subject: [PATCH 03/10] add qq plot docu #238 --- docs/Signal.fsx | 126 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 125 insertions(+), 1 deletion(-) diff --git a/docs/Signal.fsx b/docs/Signal.fsx index df45547d..b12571d2 100644 --- a/docs/Signal.fsx +++ b/docs/Signal.fsx @@ -540,4 +540,128 @@ fftChart (***hide***) fftChart |> GenericChart.toChartHTML -(***include-it-raw***) \ No newline at end of file +(***include-it-raw***) + +(** + +## QQ Plot + +QQ plots allow to compare two sample distributions if: + + - the underlying population distribution is unknown or if + - the relationship between two distributions should be evaluated in greater detail than just their estimated parameters + + +The most common use case is to compare two sample populations: + +*) + +//create samples +let rnd = System.Random() +let norm = FSharp.Stats.Distributions.ContinuousDistribution.normal 0. 1. + +///Example 1: Random samples from a standard normal distribution +let normalDistA = Array.init 200 (fun _ -> norm.Sample()) +let normalDistB = Array.init 200 (fun _ -> norm.Sample()) + +///Example 2: Random samples from values between 0 and 1 +let evenRandomA = Array.init 200 (fun _ -> rnd.NextDouble()) +let evenRandomB = Array.init 200 (fun _ -> rnd.NextDouble()) + + +//plots QQ plot from two sample populations +let plotFrom2Populations sampleA sampleB = + + //this is the main data plotted as x,y diagram + let qqChart = + Chart.Point (FSharp.Stats.Signal.QQPlot.fromTwoSamples sampleA sampleB ) + + let expectedLine = + let minimum = min (Quantile.mode 0. sampleA) (Quantile.mode 0. sampleB) + let maximum = max (Quantile.mode 1. sampleA) (Quantile.mode 1. sampleB) + [ + minimum,minimum + maximum,maximum + ] + |> Chart.Line + |> Chart.withTraceName "expected" + + [ + qqChart + expectedLine + ] + |> Chart.combine + |> Chart.withXAxisStyle "Quantiles sample A" + |> Chart.withYAxisStyle "Quantiles sample B" + |> Chart.withTemplate ChartTemplates.lightMirrored + +let myQQPlot = plotFrom2Populations normalDistA normalDistB + +(*** condition: ipynb ***) +#if IPYNB +myQQPlot +#endif // IPYNB + +(***hide***) +myQQPlot |> GenericChart.toChartHTML +(***include-it-raw***) + + +(** + +You also can plot the quantiles from a sample versus a normal distribution. Your data is z standardized prior to quantile determination + +*) + + +//plots QQ plot from a sample population against a standard normal distribution +let plotFromOneSampleGauss sampleA = + + let qqData = FSharp.Stats.Signal.QQPlot.fromSampleToGauss sampleA + + //this is the main data plotted as x,y diagram + let qqChart = + Chart.Point qqData + + let expectedLine = + let minimum = snd qqData.[0] + let maximum = qqData |> Seq.last |> snd + [ + minimum,minimum + maximum,maximum + ] + |> Chart.Line + |> Chart.withTraceName "expected" + + [ + qqChart + expectedLine + ] + |> Chart.combine + |> Chart.withXAxisStyle "Quantiles sample A" + |> Chart.withYAxisStyle "Quantiles sample B" + |> Chart.withTemplate ChartTemplates.lightMirrored + + +let myQQPlotOneSampleGauss = plotFromOneSampleGauss normalDistA +let myQQPlotOneSampleRandm = plotFromOneSampleGauss evenRandomA + +(*** condition: ipynb ***) +#if IPYNB +myQQPlotOneSampleGauss +#endif // IPYNB + +(***hide***) +myQQPlotOneSampleGauss |> GenericChart.toChartHTML +(***include-it-raw***) + +(*** condition: ipynb ***) +#if IPYNB +myQQPlotOneSampleRandm +#endif // IPYNB + +(***hide***) +myQQPlotOneSampleRandm |> GenericChart.toChartHTML +(***include-it-raw***) + + From 9e4d232efd81e506460232b79398303422abda39 Mon Sep 17 00:00:00 2001 From: Benedikt Venn Date: Wed, 16 Nov 2022 13:52:42 +0100 Subject: [PATCH 04/10] add inverf tests --- tests/FSharp.Stats.Tests/SpecialFunctions.fs | 28 ++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/FSharp.Stats.Tests/SpecialFunctions.fs b/tests/FSharp.Stats.Tests/SpecialFunctions.fs index f94c99cd..1e1bd132 100644 --- a/tests/FSharp.Stats.Tests/SpecialFunctions.fs +++ b/tests/FSharp.Stats.Tests/SpecialFunctions.fs @@ -457,6 +457,34 @@ let erfTests = testCase "erfcx(-infinity)" (fun _ -> Expect.isTrue (infinity.Equals(Errorfunction.erfcx -infinity)) "expected erfcx(-infinity) to be infinity" ) + //tested against R Pracma and WolframAlpha + testCase "inverf(0.01)" (fun _ -> + Expect.floatClose Accuracy.medium (Errorfunction.inverf 0.01) 0.00886250128095 "inverf returned insufficient approximation of the result" + ) + testCase "inverf(0.5)" (fun _ -> + Expect.floatClose Accuracy.medium (Errorfunction.inverf 0.5) 0.476936276204 "inverf returned insufficient approximation of the result" + ) + testCase "inverf(0.99)" (fun _ -> + Expect.floatClose Accuracy.medium (Errorfunction.inverf 0.99) 1.82138636772 "inverf returned insufficient approximation of the result" + ) + testCase "inverf(-0.95)" (fun _ -> + Expect.floatClose Accuracy.medium (Errorfunction.inverf -0.95) -1.38590382435 "inverf returned insufficient approximation of the result" + ) + testCase "inverf(1)" (fun _ -> + Expect.isTrue ((Errorfunction.inverf 1) = infinity) "inverf returned insufficient approximation of the result" + ) + testCase "inverf(-1)" (fun _ -> + Expect.isTrue ((Errorfunction.inverf -1) = -infinity) "inverf returned insufficient approximation of the result" + ) + testCase "inverf(0)" (fun _ -> + Expect.floatClose Accuracy.medium (Errorfunction.inverf 0.0) 0. "inverf returned insufficient approximation of the result" + ) + testCase "inverf(2)" (fun _ -> + Expect.isTrue (nan.Equals(Errorfunction.inverf 2)) "inverf returned insufficient approximation of the result" + ) + testCase "inverf(-2)" (fun _ -> + Expect.isTrue (nan.Equals(Errorfunction.inverf -2)) "inverf returned insufficient approximation of the result" + ) ] [] From a1a0e161dda63e2209d315f64ac74b5f5df9c6a8 Mon Sep 17 00:00:00 2001 From: Benedikt Venn Date: Wed, 16 Nov 2022 13:53:12 +0100 Subject: [PATCH 05/10] add quantile documentation --- FSharp.Stats.sln | 1 + docs/Intervals.fsx | 2 +- docs/ML.fsx | 2 +- docs/Quantiles.fsx | 272 +++++++++++++++++++++++ docs/Signal.fsx | 124 ----------- src/FSharp.Stats/SpecialFunctions/Erf.fs | 40 ++-- 6 files changed, 298 insertions(+), 143 deletions(-) create mode 100644 docs/Quantiles.fsx diff --git a/FSharp.Stats.sln b/FSharp.Stats.sln index ce24a0c4..2e4599fe 100644 --- a/FSharp.Stats.sln +++ b/FSharp.Stats.sln @@ -57,6 +57,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "docs", "docs", "{23F9FB2E-6 docs\Intervals.fsx = docs\Intervals.fsx docs\LinearAlgebra.fsx = docs\LinearAlgebra.fsx docs\Matrix_Vector.fsx = docs\Matrix_Vector.fsx + docs\Quantiles.fsx = docs\Quantiles.fsx docs\NuGet.config = docs\NuGet.config docs\Rank.fsx = docs\Rank.fsx docs\Signal.fsx = docs\Signal.fsx diff --git a/docs/Intervals.fsx b/docs/Intervals.fsx index e2bab840..867f84c7 100644 --- a/docs/Intervals.fsx +++ b/docs/Intervals.fsx @@ -1,7 +1,7 @@ (** --- title: Intervals -index: 19 +index: 20 category: Documentation categoryindex: 0 --- diff --git a/docs/ML.fsx b/docs/ML.fsx index 972bb4bc..9297df9f 100644 --- a/docs/ML.fsx +++ b/docs/ML.fsx @@ -1,7 +1,7 @@ (** --- title: Machine Learning -index: 19 +index: 21 category: Documentation categoryindex: 0 --- diff --git a/docs/Quantiles.fsx b/docs/Quantiles.fsx new file mode 100644 index 00000000..99485b78 --- /dev/null +++ b/docs/Quantiles.fsx @@ -0,0 +1,272 @@ +(** +--- +title: Quantile +index: 19 +category: Documentation +categoryindex: 0 +--- +*) + +(*** hide ***) + +(*** condition: prepare ***) +#I "../src/FSharp.Stats/bin/Release/netstandard2.0/" +#r "FSharp.Stats.dll" +#r "nuget: Plotly.NET, 2.0.0-preview.16" + +(*** condition: ipynb ***) +#if IPYNB +#r "nuget: Plotly.NET, 2.0.0-preview.16" +#r "nuget: Plotly.NET.Interactive, 2.0.0-preview.16" +#r "nuget: FSharp.Stats" +#endif // IPYNB + + +open Plotly.NET +open Plotly.NET.StyleParam +open Plotly.NET.LayoutObjects + +//some axis styling +module Chart = + let myAxis name = LinearAxis.init(Title=Title.init name,Mirror=StyleParam.Mirror.All,Ticks=StyleParam.TickOptions.Inside,ShowGrid=false,ShowLine=true) + let myAxisRange name (min,max) = LinearAxis.init(Title=Title.init name,Range=Range.MinMax(min,max),Mirror=StyleParam.Mirror.All,Ticks=StyleParam.TickOptions.Inside,ShowGrid=false,ShowLine=true) + let withAxisTitles x y chart = + chart + |> Chart.withTemplate ChartTemplates.lightMirrored + |> Chart.withXAxis (myAxis x) + |> Chart.withYAxis (myAxis y) + +(** + +# Quantile + +[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/fslaborg/FSharp.Stats/gh-pages?filepath=Quantile.ipynb) + +_Summary:_ this tutorial demonstrates how to handle quantiles and QQ-Plots + + +## QQ Plot + +QQ plots allow to compare two sample distributions if: + + - the underlying population distribution is unknown or if + - the relationship between two distributions should be evaluated in greater detail than just their estimated parameters. + + +### Comparing two sample distributions + +The most common use case is to compare two sample populations: + +Lets create four samples of size 300 first: + + - two that are drawn from a normal distribution + + - two that are drawn randomly between 0 and 1 + +*) + +open FSharp.Stats +open FSharp.Stats.Quantile + + +//create samples +let rnd = System.Random() +let norm = Distributions.ContinuousDistribution.normal 0. 1. + +///Example 1: Aamples from a standard normal distribution +let normalDistA = Array.init 300 (fun _ -> norm.Sample()) +let normalDistB = Array.init 300 (fun _ -> norm.Sample()) + +///Example 2: Random samples from values between 0 and 1 +let evenRandomA = Array.init 300 (fun _ -> rnd.NextDouble()) +let evenRandomB = Array.init 300 (fun _ -> rnd.NextDouble()) + +let exampleDistributions = + [ + Chart.Histogram(normalDistA,Name="normalDistA") |> Chart.withTemplate ChartTemplates.lightMirrored + Chart.Histogram(normalDistB,Name="normalDistB") |> Chart.withTemplate ChartTemplates.lightMirrored + Chart.Histogram(evenRandomA,Name="evenRandomA") |> Chart.withTemplate ChartTemplates.lightMirrored + Chart.Histogram(evenRandomB,Name="evenRandomB") |> Chart.withTemplate ChartTemplates.lightMirrored + ] + |> Chart.Grid(2,2) + |> Chart.withSize(800.,700.) + +(*** condition: ipynb ***) +#if IPYNB +exampleDistributions +#endif // IPYNB + +(***hide***) +exampleDistributions |> GenericChart.toChartHTML +(***include-it-raw***) + +(** + +To compare if two distributions are equal or to identify ranges in which the distributions differ the 100 quantiles from each of the two distributions can be calculated and plotted against each other. +If both distributions are similar, you would expect the quantiles to be identical and therefore are located on the bisector of the QQ-Plot. + +Lets calculate the quantiles from _normalDistA_ vs _normalDistB_ +*) + +// Here a tuple sequence is generated that pairwise contain the same quantiles from normalDistA and normalDistB +let qqData = Signal.QQPlot.fromTwoSamples normalDistA normalDistB + +// Lets check out the first 5 elements in the sequence +qqData.[0..4] +(***include-it-raw***) + +(** + +You can use this tuple sequence and plot it against each other. The diagonal line indicates the bisector where perfect matches would be located. + +*) + +//plots QQ plot from two sample populations +let plotFrom2Populations sampleA sampleB = + + //this is the main data plotted as x,y diagram + let qqData = + Signal.QQPlot.fromTwoSamples sampleA sampleB + + //for a perfect match, all points should be located on the bisector + let expectedLine = + let minimum = min (Quantile.mode 0. sampleA) (Quantile.mode 0. sampleB) + let maximum = max (Quantile.mode 1. sampleA) (Quantile.mode 1. sampleB) + [ + minimum,minimum + maximum,maximum + ] + |> Chart.Line + |> Chart.withTraceName "expected" + + [ + Chart.Point (qqData,Name="QQ") + expectedLine + ] + |> Chart.combine + |> Chart.withXAxisStyle "Quantiles sample A" + |> Chart.withYAxisStyle "Quantiles sample B" + |> Chart.withTemplate ChartTemplates.lightMirrored + +let myQQPlot = plotFrom2Populations normalDistA normalDistB + + +(*** condition: ipynb ***) +#if IPYNB +myQQPlot +#endif // IPYNB + +(***hide***) +myQQPlot |> GenericChart.toChartHTML +(***include-it-raw***) + + +(** + +The both samples were taken from the same normal distribution and therefore they match pretty well. + +### Comparing a sample against a normal distribution + +You also can plot the quantiles from a sample versus a normal distribution. Your data is z standardized prior to quantile determination to have zero mean and unit variance. + +*) + +let multipleQQPlots = + [ + plotFrom2Populations normalDistA normalDistB |> Chart.withXAxisStyle "normalA" |> Chart.withYAxisStyle "normalB" + plotFrom2Populations normalDistA evenRandomB |> Chart.withXAxisStyle "normalA" |> Chart.withYAxisStyle "randomB" + plotFrom2Populations evenRandomA normalDistB |> Chart.withXAxisStyle "randomA" |> Chart.withYAxisStyle "normalB" + plotFrom2Populations evenRandomA evenRandomB |> Chart.withXAxisStyle "randomA" |> Chart.withYAxisStyle "randomB" + ] + |> Chart.Grid(2,2) + |> Chart.withLegend false + |> Chart.withSize(800.,700.) + +(*** condition: ipynb ***) +#if IPYNB +multipleQQPlots +#endif // IPYNB + +(***hide***) +multipleQQPlots |> GenericChart.toChartHTML +(***include-it-raw***) + + +(** + +When QQ-plots are generated for pairwise comparisons, it is obvious, that the random-random and normal-normal samples fit nicely. The cross comparisons between normal and random samples do not match. +Its easy to see that the random smaples are distributed between 0 and 1 while the samples from the normal distributions range from ~-2 to ~2 + +*) + + + +//The raw qq-plot data of a standard normal distribution and the sample distribution +let qqData sample = FSharp.Stats.Signal.QQPlot.fromSampleToGauss sample + + +//plots QQ plot from a sample population against a standard normal distribution +let plotFromOneSampleGauss sample = + + //this is the main data plotted as x,y diagram + let qqData = FSharp.Stats.Signal.QQPlot.fromSampleToGauss sample + + let qqChart = + Chart.Point qqData + + let expectedLine = + let minimum = snd qqData.[0] + let maximum = qqData |> Seq.last |> snd + [ + minimum,minimum + maximum,maximum + ] + |> Chart.Line + |> Chart.withTraceName "expected" + + [ + qqChart + expectedLine + ] + |> Chart.combine + |> Chart.withXAxisStyle "Quantiles sample" + |> Chart.withYAxisStyle "Quantiles gauss" + |> Chart.withTemplate ChartTemplates.lightMirrored + + +let myQQPlotOneSampleGauss = plotFromOneSampleGauss normalDistA + +(*** condition: ipynb ***) +#if IPYNB +myQQPlotOneSampleGauss +#endif // IPYNB + +(***hide***) +myQQPlotOneSampleGauss |> GenericChart.toChartHTML +(***include-it-raw***) + + + +(** + +As seen above the sample perfectly matches the expected quantiles from a normal distribution. This is expected because the sample was generated by sampling from an normal distribution. + +*) + +let myQQPlotOneSampleRandm = plotFromOneSampleGauss evenRandomA + +(*** condition: ipynb ***) +#if IPYNB +myQQPlotOneSampleRandm +#endif // IPYNB + +(***hide***) +myQQPlotOneSampleRandm |> GenericChart.toChartHTML +(***include-it-raw***) + + +(** + +As seen above the sample does not matches the expected quantiles from a normal distribution. The sample derives from an random sampling between 0 and 1 and therefore is overrepresented in the tails. + +*) \ No newline at end of file diff --git a/docs/Signal.fsx b/docs/Signal.fsx index b12571d2..e4f83a17 100644 --- a/docs/Signal.fsx +++ b/docs/Signal.fsx @@ -541,127 +541,3 @@ fftChart (***hide***) fftChart |> GenericChart.toChartHTML (***include-it-raw***) - -(** - -## QQ Plot - -QQ plots allow to compare two sample distributions if: - - - the underlying population distribution is unknown or if - - the relationship between two distributions should be evaluated in greater detail than just their estimated parameters - - -The most common use case is to compare two sample populations: - -*) - -//create samples -let rnd = System.Random() -let norm = FSharp.Stats.Distributions.ContinuousDistribution.normal 0. 1. - -///Example 1: Random samples from a standard normal distribution -let normalDistA = Array.init 200 (fun _ -> norm.Sample()) -let normalDistB = Array.init 200 (fun _ -> norm.Sample()) - -///Example 2: Random samples from values between 0 and 1 -let evenRandomA = Array.init 200 (fun _ -> rnd.NextDouble()) -let evenRandomB = Array.init 200 (fun _ -> rnd.NextDouble()) - - -//plots QQ plot from two sample populations -let plotFrom2Populations sampleA sampleB = - - //this is the main data plotted as x,y diagram - let qqChart = - Chart.Point (FSharp.Stats.Signal.QQPlot.fromTwoSamples sampleA sampleB ) - - let expectedLine = - let minimum = min (Quantile.mode 0. sampleA) (Quantile.mode 0. sampleB) - let maximum = max (Quantile.mode 1. sampleA) (Quantile.mode 1. sampleB) - [ - minimum,minimum - maximum,maximum - ] - |> Chart.Line - |> Chart.withTraceName "expected" - - [ - qqChart - expectedLine - ] - |> Chart.combine - |> Chart.withXAxisStyle "Quantiles sample A" - |> Chart.withYAxisStyle "Quantiles sample B" - |> Chart.withTemplate ChartTemplates.lightMirrored - -let myQQPlot = plotFrom2Populations normalDistA normalDistB - -(*** condition: ipynb ***) -#if IPYNB -myQQPlot -#endif // IPYNB - -(***hide***) -myQQPlot |> GenericChart.toChartHTML -(***include-it-raw***) - - -(** - -You also can plot the quantiles from a sample versus a normal distribution. Your data is z standardized prior to quantile determination - -*) - - -//plots QQ plot from a sample population against a standard normal distribution -let plotFromOneSampleGauss sampleA = - - let qqData = FSharp.Stats.Signal.QQPlot.fromSampleToGauss sampleA - - //this is the main data plotted as x,y diagram - let qqChart = - Chart.Point qqData - - let expectedLine = - let minimum = snd qqData.[0] - let maximum = qqData |> Seq.last |> snd - [ - minimum,minimum - maximum,maximum - ] - |> Chart.Line - |> Chart.withTraceName "expected" - - [ - qqChart - expectedLine - ] - |> Chart.combine - |> Chart.withXAxisStyle "Quantiles sample A" - |> Chart.withYAxisStyle "Quantiles sample B" - |> Chart.withTemplate ChartTemplates.lightMirrored - - -let myQQPlotOneSampleGauss = plotFromOneSampleGauss normalDistA -let myQQPlotOneSampleRandm = plotFromOneSampleGauss evenRandomA - -(*** condition: ipynb ***) -#if IPYNB -myQQPlotOneSampleGauss -#endif // IPYNB - -(***hide***) -myQQPlotOneSampleGauss |> GenericChart.toChartHTML -(***include-it-raw***) - -(*** condition: ipynb ***) -#if IPYNB -myQQPlotOneSampleRandm -#endif // IPYNB - -(***hide***) -myQQPlotOneSampleRandm |> GenericChart.toChartHTML -(***include-it-raw***) - - diff --git a/src/FSharp.Stats/SpecialFunctions/Erf.fs b/src/FSharp.Stats/SpecialFunctions/Erf.fs index 9b4d0021..4b5d129c 100644 --- a/src/FSharp.Stats/SpecialFunctions/Erf.fs +++ b/src/FSharp.Stats/SpecialFunctions/Erf.fs @@ -81,29 +81,35 @@ module Errorfunction = let private c = [|-1.970840454; -1.62490649; 3.429567803; 1.641345311|] let private d = [|1.; 3.543889200; 1.637067800|] - // inverse of error function. uses newton refinement; from https://libit.sourceforge.net/ + /// inverse of error function. uses newton refinement; from https://libit.sourceforge.net/ + /// accuracy to the fifth digit let inverf x = - let sqrtPi = sqrt Math.PI + match x with + | x when x = -1. -> -infinity + | x when x = 1. -> infinity + | _ -> + let sqrtPi = sqrt Math.PI - let z = abs x + let z = abs x - let r = - if z <= 0.7 then - let x2 = x * x - let r = z * (((a.[3] * x2 + a.[2]) * x2 + a.[1]) * x2 + a.[0]) - r / ((((b.[4] * x2 + b.[3]) * x2 + b.[2]) * x2 + b.[1]) * x2 + b.[0]) - else - let y = sqrt( -log((1. - z)/2.)) - let r = (((c.[3] * y + c.[2]) * y + c.[1]) * y + c.[0]) - r / ((d.[2] * y + d.[1]) * y + d.[0]) + let r = + if z <= 0.7 then + let x2 = x * x + let r = z * (((a.[3] * x2 + a.[2]) * x2 + a.[1]) * x2 + a.[0]) + r / ((((b.[4] * x2 + b.[3]) * x2 + b.[2]) * x2 + b.[1]) * x2 + b.[0]) + else + let y = sqrt( -log((1. - z)/2.)) + let r = (((c.[3] * y + c.[2]) * y + c.[1]) * y + c.[0]) + r / ((d.[2] * y + d.[1]) * y + d.[0]) - let r' = r * float (sign x) - let z' = z * float (sign x) + let r' = r * float (sign x) + let z' = z * float (sign x) - let r'' = r' - (Erf(r') - z')/(2./sqrtPi * exp(-r' * r')) - r'' - (Erf(r'') - z')/(2./sqrtPi *exp(-r'' * r'')) + let r'' = r' - (Erf(r') - z')/(2./sqrtPi * exp(-r' * r')) + r'' - (Erf(r'') - z')/(2./sqrtPi *exp(-r'' * r'')) - // inverse of complementary error function + /// inverse of complementary error function + /// accuracy to the fifth digit let inverfc x = inverf (1. - x) From 7ca37e77423cd4b7029a23de9a35b0c4a26e3ae8 Mon Sep 17 00:00:00 2001 From: bvenn Date: Wed, 16 Nov 2022 20:58:02 +0100 Subject: [PATCH 06/10] update qq plot --- docs/Quantiles.fsx | 67 ++++++++++++++++++++++++++++--- src/FSharp.Stats/Signal/QQPlot.fs | 43 ++++++++++++++++++++ 2 files changed, 105 insertions(+), 5 deletions(-) diff --git a/docs/Quantiles.fsx b/docs/Quantiles.fsx index 99485b78..21e99736 100644 --- a/docs/Quantiles.fsx +++ b/docs/Quantiles.fsx @@ -44,6 +44,65 @@ module Chart = _Summary:_ this tutorial demonstrates how to handle quantiles and QQ-Plots +## Quantiles + +Quantiles are values that divide data into equally spaced groups. Percentiles are just quantiles that divide the data in 100 equally sized groups. +The median for example defines the 0.5 quantile or 0.5 percentile. You can calculate the quantile by calculating how many values are less than the value you are interested in. + +There are many possibilities to handle ties or data that cannot be split equally. The default quantile version used in R is ´Quantile.mode´. + +Lets sample 1000 data points from a normal distribution and calculate some percentiles. +*) + +open FSharp.Stats +open FSharp.Stats.Quantile + +let rng = Distributions.ContinuousDistribution.normal 3. 1. + +let sample = Array.init 1000 (fun _ -> rng.Sample()) + +let quantile25 = Quantile.mode 0.25 sample +let quantile50 = Quantile.mode 0.50 sample +let quantile75 = Quantile.mode 0.75 sample +let quantile100 = Quantile.mode 1.00 sample + + +[|quantile25;quantile50;quantile75;quantile100|] +(***include-it-raw***) + + +(** + +These special quantiles are also called quartiles since the divide the data into 4 sections. +Now we can divide the data into the ranges defined by the quantiles and plot them. Here the ranges defines half-open interval: + +*) + +let range25 = sample |> Array.filter (fun x -> x < quantile25) +let range50 = sample |> Array.filter (fun x -> x > quantile25 && x < quantile50) +let range75 = sample |> Array.filter (fun x -> x > quantile50 && x < quantile75) +let range100 = sample |> Array.filter (fun x -> x > quantile75) + +let quartilePlot = + [| + Chart.Histogram(range25) |> Chart.withXAxisStyle("25",MinMax=(0.,6.)) + Chart.Histogram(range50) |> Chart.withXAxisStyle("50",MinMax=(0.,6.)) + Chart.Histogram(range75) |> Chart.withXAxisStyle("75",MinMax=(0.,6.)) + Chart.Histogram(range100) |> Chart.withXAxisStyle("100",MinMax=(0.,6.)) + |] + |> Chart.Grid(4,1) + +(*** condition: ipynb ***) +#if IPYNB +quartilePlot +#endif // IPYNB + +(***hide***) +quartilePlot |> GenericChart.toChartHTML +(***include-it-raw***) + + +(** ## QQ Plot @@ -65,9 +124,6 @@ Lets create four samples of size 300 first: *) -open FSharp.Stats -open FSharp.Stats.Quantile - //create samples let rnd = System.Random() @@ -167,7 +223,8 @@ The both samples were taken from the same normal distribution and therefore they ### Comparing a sample against a normal distribution -You also can plot the quantiles from a sample versus a normal distribution. Your data is z standardized prior to quantile determination to have zero mean and unit variance. +You also can plot the quantiles from a sample versus a normal distribution to check if your data is normally distributed. +Your data is z standardized prior to quantile determination to have zero mean and unit variance. *) @@ -202,7 +259,7 @@ Its easy to see that the random smaples are distributed between 0 and 1 while th //The raw qq-plot data of a standard normal distribution and the sample distribution -let qqData sample = FSharp.Stats.Signal.QQPlot.fromSampleToGauss sample +let qqData2 sample = FSharp.Stats.Signal.QQPlot.fromSampleToGauss sample //plots QQ plot from a sample population against a standard normal distribution diff --git a/src/FSharp.Stats/Signal/QQPlot.fs b/src/FSharp.Stats/Signal/QQPlot.fs index 9bf0c54c..182d8e89 100644 --- a/src/FSharp.Stats/Signal/QQPlot.fs +++ b/src/FSharp.Stats/Signal/QQPlot.fs @@ -15,6 +15,10 @@ module QQPlot = mode quantile sampleA, mode quantile sampleB ) + + let fromTwoSamples' sampleA sampleB = + 1 + /// computes the quantile quantile coordinates of a sample distributions against a normal distribution. Uses default quantile (Quantile.mode) let fromSampleToGauss sample = let standardizedData = @@ -28,3 +32,42 @@ module QQPlot = inverseCDF quantile, Quantile.mode quantile standardizedData ) + + /// Computes the quantile quantile coordinates of a sample distributions against a normal distribution. + /// The sample is z transformed and for every data point the corresponding gauss-quantile is determined. + let fromSampleToGauss' sample = + + let sampleLength = Seq.length sample + + let standardizedData = + Signal.Normalization.zScoreTransformPopulation (vector sample) + |> Vector.toArray + + let inverseCDF x = + sqrt 2. * Errorfunction.inverf (2. * x - 1.) + + standardizedData + |> Seq.mapi (fun i zScore -> + let quantile = float (i + 1) / float sampleLength + let quantileNormal = inverseCDF quantile + quantileNormal,zScore + ) + + /// Computes the quantile quantile coordinates of a sample distributions against a normal distribution. + /// The sample is standardized between 0 and 1 and for every data point the corresponding quantile is determined. + let fromSampleToUniform sample = + + let sampleLength = Seq.length sample + let interval = Intervals.ofSeq sample + + let standardizedData = + sample + |> Seq.map (fun x -> + (x - Intervals.getEnd interval) - Intervals.getSize interval + ) + + standardizedData + |> Seq.mapi (fun i zScore -> + let quantileUniform = float (i + 1) / float sampleLength + quantileUniform,zScore + ) \ No newline at end of file From b0dc13f51f1106fab8daa4a847fe0e3ddbe2aaac Mon Sep 17 00:00:00 2001 From: bvenn Date: Wed, 16 Nov 2022 21:04:27 +0100 Subject: [PATCH 07/10] update qq plot --- src/FSharp.Stats/Signal/QQPlot.fs | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/FSharp.Stats/Signal/QQPlot.fs b/src/FSharp.Stats/Signal/QQPlot.fs index 182d8e89..a5fcb1a0 100644 --- a/src/FSharp.Stats/Signal/QQPlot.fs +++ b/src/FSharp.Stats/Signal/QQPlot.fs @@ -10,14 +10,21 @@ module QQPlot = /// computes the quantile quantile coordinates of two sample distributions. Uses default quantile (Quantile.mode) let fromTwoSamples sampleA sampleB = - [0. .. 0.01 .. 1.] - |> List.map (fun quantile -> - mode quantile sampleA, - mode quantile sampleB - ) + [0. .. 0.01 .. 1.] + |> List.map (fun quantile -> + mode quantile sampleA, + mode quantile sampleB + ) let fromTwoSamples' sampleA sampleB = - 1 + let sampleALength = Seq.length sampleA + let sampleBLength = Seq.length sampleB + let minSampleLength = min sampleALength sampleBLength + let stepwidth = 1. / float minSampleLength + [stepwidth .. stepwidth .. 1.] + + + /// computes the quantile quantile coordinates of a sample distributions against a normal distribution. Uses default quantile (Quantile.mode) let fromSampleToGauss sample = From a53ab7eac835b72448b7bcaec17891438089a89d Mon Sep 17 00:00:00 2001 From: Benedikt Venn Date: Thu, 17 Nov 2022 17:27:00 +0100 Subject: [PATCH 08/10] fix quantile normalization #238 --- docs/Quantiles.fsx | 39 ++++--- src/FSharp.Stats/Signal/QQPlot.fs | 167 +++++++++++++++++++++--------- 2 files changed, 142 insertions(+), 64 deletions(-) diff --git a/docs/Quantiles.fsx b/docs/Quantiles.fsx index 21e99736..649bc4ee 100644 --- a/docs/Quantiles.fsx +++ b/docs/Quantiles.fsx @@ -49,9 +49,9 @@ _Summary:_ this tutorial demonstrates how to handle quantiles and QQ-Plots Quantiles are values that divide data into equally spaced groups. Percentiles are just quantiles that divide the data in 100 equally sized groups. The median for example defines the 0.5 quantile or 0.5 percentile. You can calculate the quantile by calculating how many values are less than the value you are interested in. -There are many possibilities to handle ties or data that cannot be split equally. The default quantile version used in R is ´Quantile.mode´. +_Note: There are many possibilities to handle ties or data that cannot be split equally. The default quantile version used in R is `Quantile.mode`._ -Lets sample 1000 data points from a normal distribution and calculate some percentiles. +Let's sample 1000 data points from a normal distribution and calculate some percentiles. *) open FSharp.Stats @@ -83,12 +83,13 @@ let range50 = sample |> Array.filter (fun x -> x > quantile25 && x < quantile50 let range75 = sample |> Array.filter (fun x -> x > quantile50 && x < quantile75) let range100 = sample |> Array.filter (fun x -> x > quantile75) +(*** hide ***) let quartilePlot = [| - Chart.Histogram(range25) |> Chart.withXAxisStyle("25",MinMax=(0.,6.)) - Chart.Histogram(range50) |> Chart.withXAxisStyle("50",MinMax=(0.,6.)) - Chart.Histogram(range75) |> Chart.withXAxisStyle("75",MinMax=(0.,6.)) - Chart.Histogram(range100) |> Chart.withXAxisStyle("100",MinMax=(0.,6.)) + Chart.Histogram(range25,"25") |> Chart.withTemplate ChartTemplates.lightMirrored |> Chart.withXAxisStyle("",MinMax=(0.,6.)) |> Chart.withYAxisStyle("Quartil 25") + Chart.Histogram(range50,"50") |> Chart.withTemplate ChartTemplates.lightMirrored |> Chart.withXAxisStyle("",MinMax=(0.,6.)) |> Chart.withYAxisStyle("Quartil 50") + Chart.Histogram(range75,"75") |> Chart.withTemplate ChartTemplates.lightMirrored |> Chart.withXAxisStyle("",MinMax=(0.,6.)) |> Chart.withYAxisStyle("Quartil 75") + Chart.Histogram(range100,"100") |> Chart.withTemplate ChartTemplates.lightMirrored |> Chart.withXAxisStyle("",MinMax=(0.,6.)) |> Chart.withYAxisStyle("Quartil 100") |] |> Chart.Grid(4,1) @@ -111,10 +112,17 @@ QQ plots allow to compare two sample distributions if: - the underlying population distribution is unknown or if - the relationship between two distributions should be evaluated in greater detail than just their estimated parameters. +When sample is compared to a known distribution, every quantile can be calculated exactly by inverting the CDF. If you compare two samples, there is no uniquely defined CDF, so quantiles have to be interpolated. Additionally +there are various methods for determining Quantiles that differ in handling ties and uneven spacing. ### Comparing two sample distributions -The most common use case is to compare two sample populations: +Two sample populations can be compared by QQ-plots where quantiles of the first sample are plotted against quantiles of the second sample. If the sample length is equal, both samples are ordered and plotted as pairs. + +$qq_i = X_i,Y_i$ with X and Y beeing sample sequences of length n and $1 <= i <= n)$. + + +If samples sizes are unequal the quantiles have to be estimated. Note that this method does not replace a significance test wether the distributions differ statistically. Lets create four samples of size 300 first: @@ -165,10 +173,10 @@ Lets calculate the quantiles from _normalDistA_ vs _normalDistB_ *) // Here a tuple sequence is generated that pairwise contain the same quantiles from normalDistA and normalDistB -let qqData = Signal.QQPlot.fromTwoSamples normalDistA normalDistB +let qqData = Signal.QQPlot.fromTwoSamples() normalDistA normalDistB // Lets check out the first 5 elements in the sequence -qqData.[0..4] +Seq.head qqData (***include-it-raw***) (** @@ -177,12 +185,15 @@ You can use this tuple sequence and plot it against each other. The diagonal lin *) +open FSharp.Stats.Signal +open FSharp.Stats.Signal.QQPlot + //plots QQ plot from two sample populations let plotFrom2Populations sampleA sampleB = //this is the main data plotted as x,y diagram let qqData = - Signal.QQPlot.fromTwoSamples sampleA sampleB + QQPlot.fromTwoSamples() sampleA sampleB //for a perfect match, all points should be located on the bisector let expectedLine = @@ -259,20 +270,20 @@ Its easy to see that the random smaples are distributed between 0 and 1 while th //The raw qq-plot data of a standard normal distribution and the sample distribution -let qqData2 sample = FSharp.Stats.Signal.QQPlot.fromSampleToGauss sample +let qq2Normal sample = QQPlot.fromSampleToGauss(Method=QuantileMethod.Rankit,ZTransform=false) sample //plots QQ plot from a sample population against a standard normal distribution let plotFromOneSampleGauss sample = //this is the main data plotted as x,y diagram - let qqData = FSharp.Stats.Signal.QQPlot.fromSampleToGauss sample + let qqData = QQPlot.fromSampleToGauss(Method=QuantileMethod.Rankit,ZTransform=false) sample let qqChart = Chart.Point qqData let expectedLine = - let minimum = snd qqData.[0] + let minimum = qqData |> Seq.head |> snd let maximum = qqData |> Seq.last |> snd [ minimum,minimum @@ -286,7 +297,7 @@ let plotFromOneSampleGauss sample = expectedLine ] |> Chart.combine - |> Chart.withXAxisStyle "Quantiles sample" + |> Chart.withXAxisStyle "Theoretical quantiles" |> Chart.withYAxisStyle "Quantiles gauss" |> Chart.withTemplate ChartTemplates.lightMirrored diff --git a/src/FSharp.Stats/Signal/QQPlot.fs b/src/FSharp.Stats/Signal/QQPlot.fs index a5fcb1a0..f9c7fc42 100644 --- a/src/FSharp.Stats/Signal/QQPlot.fs +++ b/src/FSharp.Stats/Signal/QQPlot.fs @@ -7,74 +7,141 @@ open FSharp.Stats.SpecialFunctions open FSharp.Stats.Quantile module QQPlot = - - /// computes the quantile quantile coordinates of two sample distributions. Uses default quantile (Quantile.mode) - let fromTwoSamples sampleA sampleB = - [0. .. 0.01 .. 1.] - |> List.map (fun quantile -> - mode quantile sampleA, - mode quantile sampleB - ) - let fromTwoSamples' sampleA sampleB = + type QuantileMethod = + | Blom + | Rankit + | Tukey + | VanDerWerden + + /// computes the quantile quantile coordinates of two sample distributions. Uses default quantile (Quantile.mode) + // tested against R qqnorm + let internal fromTwoSamples method sampleA sampleB = let sampleALength = Seq.length sampleA let sampleBLength = Seq.length sampleB - let minSampleLength = min sampleALength sampleBLength - let stepwidth = 1. / float minSampleLength - [stepwidth .. stepwidth .. 1.] + let minSampleLength = float (min sampleALength sampleBLength) + + let (smallSet,bigSet) = if sampleALength <= sampleBLength then (sampleA,sampleB) else (sampleB,sampleA) + + let getQuantile rank = + match method with + | Blom -> (rank - 3. / 8.) / (minSampleLength + 1. / 4.) + | Rankit -> (rank - 1. / 2.) / minSampleLength + | Tukey -> (rank - 1. / 3.) / (minSampleLength + 1. / 3.) + | VanDerWerden -> rank / (minSampleLength + 1.) + + smallSet + |> Seq.sort + |> Seq.mapi (fun i x -> + let rank = float (i + 1) + let pi = getQuantile rank + x,Quantile.mode pi bigSet + ) + + + /// Computes the quantile quantile coordinates of a sample distributions against a normal distribution. + /// The sample can be z transformed. StandardMethod = Rankit + // tested against R qqnorm + let internal fromSampleToGauss method zTransformSample (sample:seq) = - + let sampleLength = Seq.length sample |> float - - /// computes the quantile quantile coordinates of a sample distributions against a normal distribution. Uses default quantile (Quantile.mode) - let fromSampleToGauss sample = let standardizedData = - Signal.Normalization.zScoreTransformPopulation (vector sample) + if zTransformSample then + Signal.Normalization.zScoreTransformPopulation (vector sample) + |> Vector.toArray + else + sample |> Seq.toArray + + let getQuantile rank = + match method with + | Blom -> (rank - 3. / 8.) / (sampleLength + 1. / 4.) + | Rankit -> (rank - 1. / 2.) / sampleLength + | Tukey -> (rank - 1. / 3.) / (sampleLength + 1. / 3.) + | VanDerWerden -> rank / (sampleLength + 1.) let inverseCDF x = sqrt 2. * Errorfunction.inverf (2. * x - 1.) - [0. .. 0.01 .. 1.] - |> List.map (fun quantile -> - inverseCDF quantile, - Quantile.mode quantile standardizedData + standardizedData + |> Seq.sort + |> Seq.mapi (fun i x -> + let rank = float (i + 1) + let pi = getQuantile rank + inverseCDF pi,x ) /// Computes the quantile quantile coordinates of a sample distributions against a normal distribution. - /// The sample is z transformed and for every data point the corresponding gauss-quantile is determined. - let fromSampleToGauss' sample = + /// The sample can be standardized between 0 and 1. + let internal fromSampleToUniform method standardizeSample (sample:seq) = + + let sampleLength = Seq.length sample |> float + + let standardizedSample = + if standardizeSample then + let min = Seq.min sample + let max = Seq.max sample + sample |> Seq.map (fun x -> (x-min) / (max-min)) + else + sample + + let getQuantile rank = + match method with + | Blom -> (rank - 3. / 8.) / (sampleLength + 1. / 4.) + | Rankit -> (rank - 1. / 2.) / sampleLength + | Tukey -> (rank - 1. / 3.) / (sampleLength + 1. / 3.) + | VanDerWerden -> rank / (sampleLength + 1.) + + standardizedSample + |> Seq.sort + |> Seq.mapi (fun i x -> + let rank = float (i + 1) + let pi = getQuantile rank + pi,x + ) - let sampleLength = Seq.length sample - let standardizedData = - Signal.Normalization.zScoreTransformPopulation (vector sample) - |> Vector.toArray +type QQPlot() = - let inverseCDF x = - sqrt 2. * Errorfunction.inverf (2. * x - 1.) + + + /// Computes the quantile quantile coordinates of two sample distributions. + /// Uses default quantile (Quantile.mode) and Rankit method + static member fromTwoSamples(?Method:QQPlot.QuantileMethod) = + + let method = defaultArg Method QQPlot.QuantileMethod.Rankit + + fun (sampleA: seq) (sampleB: seq) -> + QQPlot.fromTwoSamples method sampleA sampleB + - standardizedData - |> Seq.mapi (fun i zScore -> - let quantile = float (i + 1) / float sampleLength - let quantileNormal = inverseCDF quantile - quantileNormal,zScore - ) /// Computes the quantile quantile coordinates of a sample distributions against a normal distribution. - /// The sample is standardized between 0 and 1 and for every data point the corresponding quantile is determined. - let fromSampleToUniform sample = + /// The sample can be z transformed. default = Rankit + static member fromSampleToGauss(?Method:QQPlot.QuantileMethod,?ZTransform:bool) = + + let standardize = defaultArg ZTransform false + let method = defaultArg Method QQPlot.QuantileMethod.Rankit + + fun (sample: seq) -> + QQPlot.fromSampleToGauss method standardize sample + + + + + /// Computes the quantile quantile coordinates of a sample distributions against a normal distribution. + /// The sample can be standardized to the range between 0 and 1. default = Rankit + static member fromSampleToUniform(?Method:QQPlot.QuantileMethod,?Standardize:bool) = + + let standardize = defaultArg Standardize false + let method = defaultArg Method QQPlot.QuantileMethod.Rankit + + fun (sample: seq) -> + QQPlot.fromSampleToUniform method standardize sample + + + + + - let sampleLength = Seq.length sample - let interval = Intervals.ofSeq sample - let standardizedData = - sample - |> Seq.map (fun x -> - (x - Intervals.getEnd interval) - Intervals.getSize interval - ) - - standardizedData - |> Seq.mapi (fun i zScore -> - let quantileUniform = float (i + 1) / float sampleLength - quantileUniform,zScore - ) \ No newline at end of file From 20b2416d678bfd411b55288316a163e345517323 Mon Sep 17 00:00:00 2001 From: Benedikt Venn Date: Fri, 18 Nov 2022 15:59:36 +0100 Subject: [PATCH 09/10] update qq plot from two samples --- src/FSharp.Stats/Signal/QQPlot.fs | 103 ++++++++++++++++++++---------- 1 file changed, 71 insertions(+), 32 deletions(-) diff --git a/src/FSharp.Stats/Signal/QQPlot.fs b/src/FSharp.Stats/Signal/QQPlot.fs index f9c7fc42..b2d43b40 100644 --- a/src/FSharp.Stats/Signal/QQPlot.fs +++ b/src/FSharp.Stats/Signal/QQPlot.fs @@ -5,6 +5,7 @@ open FSharp.Stats open System open FSharp.Stats.SpecialFunctions open FSharp.Stats.Quantile +open FSharp.Stats.Interpolation module QQPlot = @@ -16,27 +17,54 @@ module QQPlot = /// computes the quantile quantile coordinates of two sample distributions. Uses default quantile (Quantile.mode) // tested against R qqnorm - let internal fromTwoSamples method sampleA sampleB = + //let internal fromTwoSamples method sampleA sampleB = + // let sampleALength = Seq.length sampleA + // let sampleBLength = Seq.length sampleB + // let minSampleLength = float (min sampleALength sampleBLength) + // + // let (smallSet,bigSet) = if sampleALength <= sampleBLength then (sampleA,sampleB) else (sampleB,sampleA) + // + // let getQuantile rank = + // match method with + // | Blom -> (rank - 3. / 8.) / (minSampleLength + 1. / 4.) + // | Rankit -> (rank - 1. / 2.) / minSampleLength + // | Tukey -> (rank - 1. / 3.) / (minSampleLength + 1. / 3.) + // | VanDerWerden -> rank / (minSampleLength + 1.) + // + // smallSet + // |> Seq.sort + // |> Seq.mapi (fun i x -> + // let rank = float (i + 1) + // let pi = getQuantile rank + // x,Quantile.mode pi bigSet + // ) + + /// Computes the quantile quantile coordinates of two sample distributions. + /// If samples are not the same size, the larger samples is interpolated to match the quantiles of the smaller. + // tested against R qqplot + let fromTwoSamples sampleA sampleB = let sampleALength = Seq.length sampleA let sampleBLength = Seq.length sampleB - let minSampleLength = float (min sampleALength sampleBLength) - - let (smallSet,bigSet) = if sampleALength <= sampleBLength then (sampleA,sampleB) else (sampleB,sampleA) - - let getQuantile rank = - match method with - | Blom -> (rank - 3. / 8.) / (minSampleLength + 1. / 4.) - | Rankit -> (rank - 1. / 2.) / minSampleLength - | Tukey -> (rank - 1. / 3.) / (minSampleLength + 1. / 3.) - | VanDerWerden -> rank / (minSampleLength + 1.) - smallSet - |> Seq.sort - |> Seq.mapi (fun i x -> - let rank = float (i + 1) - let pi = getQuantile rank - x,Quantile.mode pi bigSet + let ((sN,smallSet),(bN,bigSet)) = + if sampleALength <= sampleBLength then + ((sampleALength,Seq.sort sampleA),(sampleBLength,Seq.sort sampleB)) + else ((sampleBLength,Seq.sort sampleB),(sampleALength,Seq.sort sampleA)) + + let linearSpl = LinearSpline.initInterpolate [|0. .. float bN - 1.|] (Array.ofSeq bigSet) + let stepwidth = float (bN-1) / float (sN-1) + + let approxbigSet = + Array.init sN (fun i -> + let xV = (float i) * stepwidth + let yV = LinearSpline.interpolate linearSpl xV + yV ) + + if sampleALength <= sampleBLength then + Seq.zip smallSet approxbigSet + else + Seq.zip approxbigSet smallSet /// Computes the quantile quantile coordinates of a sample distributions against a normal distribution. @@ -100,38 +128,42 @@ module QQPlot = pi,x ) + /// Computes the quantile quantile coordinates of a sample distributions against a specified inverseCDF function. You can derive an inverse CDF of any statistical distribution. + let internal fromSampleToInverseCDF method invCDF (sample:seq) = -type QQPlot() = - - + let sampleLength = Seq.length sample |> float - /// Computes the quantile quantile coordinates of two sample distributions. - /// Uses default quantile (Quantile.mode) and Rankit method - static member fromTwoSamples(?Method:QQPlot.QuantileMethod) = + let getQuantile rank = + match method with + | Blom -> (rank - 3. / 8.) / (sampleLength + 1. / 4.) + | Rankit -> (rank - 1. / 2.) / sampleLength + | Tukey -> (rank - 1. / 3.) / (sampleLength + 1. / 3.) + | VanDerWerden -> rank / (sampleLength + 1.) - let method = defaultArg Method QQPlot.QuantileMethod.Rankit + sample + |> Seq.sort + |> Seq.mapi (fun i x -> + let rank = float (i + 1) + let pi = getQuantile rank + invCDF pi,x + ) - fun (sampleA: seq) (sampleB: seq) -> - QQPlot.fromTwoSamples method sampleA sampleB - +type QQPlot() = /// Computes the quantile quantile coordinates of a sample distributions against a normal distribution. /// The sample can be z transformed. default = Rankit - static member fromSampleToGauss(?Method:QQPlot.QuantileMethod,?ZTransform:bool) = + static member toGauss(?Method:QQPlot.QuantileMethod,?ZTransform:bool) = let standardize = defaultArg ZTransform false let method = defaultArg Method QQPlot.QuantileMethod.Rankit fun (sample: seq) -> QQPlot.fromSampleToGauss method standardize sample - - - /// Computes the quantile quantile coordinates of a sample distributions against a normal distribution. /// The sample can be standardized to the range between 0 and 1. default = Rankit - static member fromSampleToUniform(?Method:QQPlot.QuantileMethod,?Standardize:bool) = + static member toUniform(?Method:QQPlot.QuantileMethod,?Standardize:bool) = let standardize = defaultArg Standardize false let method = defaultArg Method QQPlot.QuantileMethod.Rankit @@ -139,6 +171,13 @@ type QQPlot() = fun (sample: seq) -> QQPlot.fromSampleToUniform method standardize sample + /// Computes the quantile quantile coordinates of a sample distributions against a specified inverseCDF function. You can derive an inverse CDF of any statistical distribution. + static member toInvCDF(inverseCDF:(float -> float),?Method:QQPlot.QuantileMethod) = + + let method = defaultArg Method QQPlot.QuantileMethod.Rankit + + fun (sample: seq) -> + QQPlot.fromSampleToInverseCDF method inverseCDF sample From 26818d6b4d7483d2a7357ae23a46e746a88969f0 Mon Sep 17 00:00:00 2001 From: Benedikt Venn Date: Fri, 18 Nov 2022 15:59:43 +0100 Subject: [PATCH 10/10] update qq plot documentation --- docs/ML.fsx | 2 +- docs/Quantiles.fsx | 220 +++++++++++++++++++++++++++++++-------------- 2 files changed, 156 insertions(+), 66 deletions(-) diff --git a/docs/ML.fsx b/docs/ML.fsx index 9297df9f..b866206b 100644 --- a/docs/ML.fsx +++ b/docs/ML.fsx @@ -46,7 +46,7 @@ _Summary:_ this tutorial demonstrates functionality relevant in the context of m ### Table of contents - - [Dimensionality Reduction](#Dimensionality Reduction) + - [Dimensionality Reduction](#Dimensionality-Reduction) - [PCA](#PCA) ## Dimensionality Reduction diff --git a/docs/Quantiles.fsx b/docs/Quantiles.fsx index 649bc4ee..c168327d 100644 --- a/docs/Quantiles.fsx +++ b/docs/Quantiles.fsx @@ -44,18 +44,27 @@ module Chart = _Summary:_ this tutorial demonstrates how to handle quantiles and QQ-Plots +### Table of contents + + - [Quantiles](#Quantiles) + - [QQ plot](#QQ-plot) + - [Comparing two sample distributions](#Comparing-two-sample-distributions) + - [Comparing a sample against a distribution](#Comparing-a-sample-against-a-distribution) + - [Normal distribution](#Normal-distribution) + - [Uniform Distribution](#Uniform-Distribution) + ## Quantiles Quantiles are values that divide data into equally spaced groups. Percentiles are just quantiles that divide the data in 100 equally sized groups. -The median for example defines the 0.5 quantile or 0.5 percentile. You can calculate the quantile by calculating how many values are less than the value you are interested in. +The median for example defines the 0.5 quantile or 0.5 percentile. You can calculate the quantile by what proportion of values are less than the value you are interested in. -_Note: There are many possibilities to handle ties or data that cannot be split equally. The default quantile version used in R is `Quantile.mode`._ +_Note: There are many possibilities to handle ties or data that cannot be split equally. The default quantile method used here is `Quantile.mode`._ Let's sample 1000 data points from a normal distribution and calculate some percentiles. *) - +open System open FSharp.Stats -open FSharp.Stats.Quantile +open FSharp.Stats.Signal let rng = Distributions.ContinuousDistribution.normal 3. 1. @@ -74,7 +83,7 @@ let quantile100 = Quantile.mode 1.00 sample (** These special quantiles are also called quartiles since the divide the data into 4 sections. -Now we can divide the data into the ranges defined by the quantiles and plot them. Here the ranges defines half-open interval: +Now we can divide the data into the ranges defined by the quantiles and plot them. Here the ranges defines half-open intervals: *) @@ -86,13 +95,14 @@ let range100 = sample |> Array.filter (fun x -> x > quantile75) (*** hide ***) let quartilePlot = [| - Chart.Histogram(range25,"25") |> Chart.withTemplate ChartTemplates.lightMirrored |> Chart.withXAxisStyle("",MinMax=(0.,6.)) |> Chart.withYAxisStyle("Quartil 25") - Chart.Histogram(range50,"50") |> Chart.withTemplate ChartTemplates.lightMirrored |> Chart.withXAxisStyle("",MinMax=(0.,6.)) |> Chart.withYAxisStyle("Quartil 50") - Chart.Histogram(range75,"75") |> Chart.withTemplate ChartTemplates.lightMirrored |> Chart.withXAxisStyle("",MinMax=(0.,6.)) |> Chart.withYAxisStyle("Quartil 75") - Chart.Histogram(range100,"100") |> Chart.withTemplate ChartTemplates.lightMirrored |> Chart.withXAxisStyle("",MinMax=(0.,6.)) |> Chart.withYAxisStyle("Quartil 100") + Chart.Histogram(range25,"25",ShowLegend=false) |> Chart.withTemplate ChartTemplates.lightMirrored |> Chart.withXAxisStyle("",MinMax=(0.,6.)) |> Chart.withYAxisStyle("Quartil 25") + Chart.Histogram(range50,"50",ShowLegend=false) |> Chart.withTemplate ChartTemplates.lightMirrored |> Chart.withXAxisStyle("",MinMax=(0.,6.)) |> Chart.withYAxisStyle("Quartil 50") + Chart.Histogram(range75,"75",ShowLegend=false) |> Chart.withTemplate ChartTemplates.lightMirrored |> Chart.withXAxisStyle("",MinMax=(0.,6.)) |> Chart.withYAxisStyle("Quartil 75") + Chart.Histogram(range100,"100",ShowLegend=false) |> Chart.withTemplate ChartTemplates.lightMirrored |> Chart.withXAxisStyle("",MinMax=(0.,6.)) |> Chart.withYAxisStyle("Quartil 100") |] |> Chart.Grid(4,1) + (*** condition: ipynb ***) #if IPYNB quartilePlot @@ -105,39 +115,39 @@ quartilePlot |> GenericChart.toChartHTML (** -## QQ Plot +## QQ plot -QQ plots allow to compare two sample distributions if: +QQ plots allow to compare sample distributions if: - the underlying population distribution is unknown or if - the relationship between two distributions should be evaluated in greater detail than just their estimated parameters. -When sample is compared to a known distribution, every quantile can be calculated exactly by inverting the CDF. If you compare two samples, there is no uniquely defined CDF, so quantiles have to be interpolated. Additionally -there are various methods for determining Quantiles that differ in handling ties and uneven spacing. +When a sample is compared to a known distribution, every quantile can be calculated exactly by inverting their CDF. If you compare two samples, there is no uniquely defined CDF, +so quantiles have to be interpolated. ### Comparing two sample distributions Two sample populations can be compared by QQ-plots where quantiles of the first sample are plotted against quantiles of the second sample. If the sample length is equal, both samples are ordered and plotted as pairs. -$qq_i = X_i,Y_i$ with X and Y beeing sample sequences of length n and $1 <= i <= n)$. +$qq_i = X_i,Y_i$ with $X$ and $Y$ beeing ordered sample sequences of length $n$ and $(1 \le i \le n)$ -If samples sizes are unequal the quantiles have to be estimated. Note that this method does not replace a significance test wether the distributions differ statistically. +If samples sizes are unequal the quantiles of the larger data set have to be interpolated from the quantiles of the smaller data set. -Lets create four samples of size 300 first: +**Lets create four samples of size 300 first:** - - two that are drawn from a normal distribution + - two that are drawn from a normal distribution of mean $3.0$ and standard deviation $0.5$ - - two that are drawn randomly between 0 and 1 + - two that are drawn randomly between 0 and 1 *) //create samples let rnd = System.Random() -let norm = Distributions.ContinuousDistribution.normal 0. 1. +let norm = Distributions.ContinuousDistribution.normal 3.0 0.5 -///Example 1: Aamples from a standard normal distribution +///Example 1: Aamples from a normal distribution let normalDistA = Array.init 300 (fun _ -> norm.Sample()) let normalDistB = Array.init 300 (fun _ -> norm.Sample()) @@ -166,76 +176,59 @@ exampleDistributions |> GenericChart.toChartHTML (** -To compare if two distributions are equal or to identify ranges in which the distributions differ the 100 quantiles from each of the two distributions can be calculated and plotted against each other. -If both distributions are similar, you would expect the quantiles to be identical and therefore are located on the bisector of the QQ-Plot. +To compare if two distributions are equal or to identify ranges in which the distributions differ, a quantile pair from each of the two distributions can be calculated and plotted against each other. +If both distributions are similar, you would expect the quantiles to be identical and therefore are located on a straight line. If the samples are of different length $m$ and $n$ the number +of quantiles is limited to $min$ $m$ $n$. For every data point of the smaller data set a corresponding quantile of the larger data set is determined. -Lets calculate the quantiles from _normalDistA_ vs _normalDistB_ +Lets calculate the quantiles from _normalDistA_ vs _normalDistB_. *) // Here a tuple sequence is generated that pairwise contain the same quantiles from normalDistA and normalDistB -let qqData = Signal.QQPlot.fromTwoSamples() normalDistA normalDistB +let qqData = QQPlot.fromTwoSamples normalDistA normalDistB // Lets check out the first 5 elements in the sequence -Seq.head qqData +Seq.take 5 qqData (***include-it-raw***) (** -You can use this tuple sequence and plot it against each other. The diagonal line indicates the bisector where perfect matches would be located. +You can use this tuple sequence and plot it against each other. *) open FSharp.Stats.Signal open FSharp.Stats.Signal.QQPlot + //plots QQ plot from two sample populations let plotFrom2Populations sampleA sampleB = - //this is the main data plotted as x,y diagram - let qqData = - QQPlot.fromTwoSamples() sampleA sampleB - - //for a perfect match, all points should be located on the bisector - let expectedLine = - let minimum = min (Quantile.mode 0. sampleA) (Quantile.mode 0. sampleB) - let maximum = max (Quantile.mode 1. sampleA) (Quantile.mode 1. sampleB) - [ - minimum,minimum - maximum,maximum - ] - |> Chart.Line - |> Chart.withTraceName "expected" + //here the coordinates are calculated + let qqCoordinates = QQPlot.fromTwoSamples sampleA sampleB - [ - Chart.Point (qqData,Name="QQ") - expectedLine - ] - |> Chart.combine + Chart.Point (qqCoordinates,Name="QQ") |> Chart.withXAxisStyle "Quantiles sample A" |> Chart.withYAxisStyle "Quantiles sample B" |> Chart.withTemplate ChartTemplates.lightMirrored -let myQQPlot = plotFrom2Populations normalDistA normalDistB +let myQQplot1 = plotFrom2Populations normalDistA normalDistB (*** condition: ipynb ***) #if IPYNB -myQQPlot +myQQplot1 #endif // IPYNB (***hide***) -myQQPlot |> GenericChart.toChartHTML +myQQplot1 |> GenericChart.toChartHTML (***include-it-raw***) (** -The both samples were taken from the same normal distribution and therefore they match pretty well. +Both samples were taken from the same distribution (here normal distribution) and therefore they match pretty well. -### Comparing a sample against a normal distribution - -You also can plot the quantiles from a sample versus a normal distribution to check if your data is normally distributed. -Your data is z standardized prior to quantile determination to have zero mean and unit variance. +In the following plot you can see four comparisons of the four distributions defined in the beginning (2x normal + 2x uniform). *) @@ -262,22 +255,44 @@ multipleQQPlots |> GenericChart.toChartHTML (** -When QQ-plots are generated for pairwise comparisons, it is obvious, that the random-random and normal-normal samples fit nicely. The cross comparisons between normal and random samples do not match. -Its easy to see that the random smaples are distributed between 0 and 1 while the samples from the normal distributions range from ~-2 to ~2 +When QQ-plots are generated for pairwise comparisons, it is obvious, that the _random_-_random_ and _normal_-_normal_ samples fit nicely. The cross comparisons between normal and random samples do not match. +Its easy to see that the random samples are distributed between 0 and 1 while the samples from the normal distributions range from $1$ to ~$5$. + + +### Comparing a sample against a distribution + +You can plot the quantiles from a sample versus a known distribution to check if your data follows the given distribution. +There are various methods to determine quantiles that differ in handling ties and uneven spacing. -*) +``` +Quantile determination methods(rank,sampleLength): + - Blom -> (rank - 3. / 8.) / (sampleLength + 1. / 4.) + - Rankit -> (rank - 1. / 2.) / sampleLength + - Tukey -> (rank - 1. / 3.) / (sampleLength + 1. / 3.) + - VanDerWerden -> rank / (sampleLength + 1.) +``` +_Note that this method does not replace a significance test wether the distributions differ statistically._ -//The raw qq-plot data of a standard normal distribution and the sample distribution -let qq2Normal sample = QQPlot.fromSampleToGauss(Method=QuantileMethod.Rankit,ZTransform=false) sample +#### Normal distribution +The data can be z standardized prior to quantile determination to have zero mean and unit variance. If the data is zTransformed the bisector defines a perfect match. -//plots QQ plot from a sample population against a standard normal distribution +*) + +// The raw qq-plot data of a standard normal distribution and the sample distribution +// defaults: +// Method: QuantileMethod.Rankit +// ZTransform: false +let qq2Normal sample = QQPlot.toGauss(Method=QuantileMethod.Rankit,ZTransform=true) sample + +// plots QQ plot from a sample population against a standard normal distribution. +// if the data is zTransformed the bisector defines a perfect match. let plotFromOneSampleGauss sample = //this is the main data plotted as x,y diagram - let qqData = QQPlot.fromSampleToGauss(Method=QuantileMethod.Rankit,ZTransform=false) sample + let qqData = QQPlot.toGauss(Method=QuantileMethod.Rankit,ZTransform=true) sample let qqChart = Chart.Point qqData @@ -297,8 +312,8 @@ let plotFromOneSampleGauss sample = expectedLine ] |> Chart.combine - |> Chart.withXAxisStyle "Theoretical quantiles" - |> Chart.withYAxisStyle "Quantiles gauss" + |> Chart.withXAxisStyle "Theoretical quantiles (normal)" + |> Chart.withYAxisStyle "Sample quantiles" |> Chart.withTemplate ChartTemplates.lightMirrored @@ -321,15 +336,17 @@ As seen above the sample perfectly matches the expected quantiles from a normal *) -let myQQPlotOneSampleRandm = plotFromOneSampleGauss evenRandomA +// compare the uniform sample against a normal distribution +let my2QQPlotOneSampleGauss = plotFromOneSampleGauss evenRandomA + (*** condition: ipynb ***) #if IPYNB -myQQPlotOneSampleRandm +my2QQPlotOneSampleGauss #endif // IPYNB (***hide***) -myQQPlotOneSampleRandm |> GenericChart.toChartHTML +my2QQPlotOneSampleGauss |> GenericChart.toChartHTML (***include-it-raw***) @@ -337,4 +354,77 @@ myQQPlotOneSampleRandm |> GenericChart.toChartHTML As seen above the sample does not matches the expected quantiles from a normal distribution. The sample derives from an random sampling between 0 and 1 and therefore is overrepresented in the tails. + +#### Uniform Distribution + +You also can plot your data against a uniform distribution. Data can be standardized to lie between $0$ and $1$ +*) + +let uniform = + QQPlot.toUniform(Method=QuantileMethod.Rankit,Standardize=false) normalDistA + |> Chart.Point + |> Chart.withXAxisStyle "Theoretical quantiles (uniform)" + |> Chart.withYAxisStyle "Sample quantiles" + |> Chart.withTemplate ChartTemplates.lightMirrored + +(*** condition: ipynb ***) +#if IPYNB +uniform +#endif // IPYNB + +(***hide***) +uniform |> GenericChart.toChartHTML +(***include-it-raw***) + +(** + +#### Any specified distribution + +You also can plot your data against a distribution you can specify. You have to define the _inverse CDF_ or also called the _Quantile function_. + +**LogNormal distribution** + +*) + +// generate a sample from a lognormal distriution +let sampleFromLogNormal = + let d = Distributions.ContinuousDistribution.logNormal 0. 1. + Array.init 500 (fun _ -> d.Sample()) + + + +// define the quantile function for the log normal distribution with parameters mu = 0 and sigma = 1 +let quantileFunctionLogNormal p = + let mu = 0. + let sigma = 1. + Math.Exp (mu + Math.Sqrt(2. * (pown sigma 2)) * SpecialFunctions.Errorfunction.inverf(2. * p - 1.)) + +let logNormalNormalDist = QQPlot.toInvCDF(quantileFunctionLogNormal,Method=QuantileMethod.Rankit) normalDistA + +let logNormalLogNormal = QQPlot.toInvCDF(quantileFunctionLogNormal,Method=QuantileMethod.Rankit) sampleFromLogNormal + +let logNormalChart = + [ + Chart.Point(logNormalNormalDist,Name="normal sample") + Chart.Point(logNormalLogNormal,Name="log normal sample") + ] + |> Chart.combine + |> Chart.withXAxisStyle "Theoretical quantiles Log Normal" + |> Chart.withYAxisStyle "Sample quantiles" + |> Chart.withTemplate ChartTemplates.lightMirrored + +(*** condition: ipynb ***) +#if IPYNB +logNormalChart +#endif // IPYNB + +(***hide***) +logNormalChart |> GenericChart.toChartHTML +(***include-it-raw***) + + +(** + +The log normal sample fits nicely to the bisector, but the sample from the normal distribution does not fit + *) \ No newline at end of file