-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathedpelt.go
145 lines (123 loc) · 5.71 KB
/
edpelt.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
// Copyright 2020 Gregory Petrosyan <[email protected]>
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package changepoint
import (
"math"
"sort"
)
// NonParametric returns indexes of elements that split data into
// "statistically homogeneous" segments. NonParametric supports
// nonparametric distributions and has O(N*log(N)) algorithmic complexity.
// NonParametric uses ED-PELT algorithm for changepoint detection.
//
// The implementation is based on the following papers:
//
// [Haynes2017] Kaylea Haynes, Paul Fearnhead, and Idris A. Eckley.
// "A computationally efficient nonparametric approach for changepoint detection."
// Statistics and Computing 27, no. 5 (2017): 1293-1305.
// https://doi.org/10.1007/s11222-016-9687-5
//
// [Killick2012] Rebecca Killick, Paul Fearnhead, and Idris A. Eckley.
// "Optimal detection of changepoints with a linear computational cost."
// Journal of the American Statistical Association 107, no. 500 (2012): 1590-1598.
// https://arxiv.org/pdf/1101.1438.pdf
func NonParametric(data []float64, minSegment int) []int {
if minSegment < 1 {
panic("minSegment must be positive")
}
n := len(data)
if n <= 2 || n/2 < minSegment {
return nil
}
// The penalty which we add to the final cost for each additional changepoint
// Here we use the Modified Bayesian Information Criterion
penalty := 3 * math.Log(float64(n))
// `k` is the number of quantiles that we use to approximate an integral during the segment cost evaluation
// We use `k=Ceiling(4*log(n))` as suggested in the Section 4.3 "Choice of K in ED-PELT" in [Haynes2017]
// `k` can't be greater than `n`, so we should always use the `Min` function here (important for n <= 8)
k := int(math.Min(float64(n), math.Ceil(4*math.Log(float64(n)))))
// We should precalculate sums for empirical CDF, it will allow fast evaluating of the segment cost
partialSums := edPartialSums(data, k)
cost := func(tau1 int, tau2 int) float64 {
return edCost(n, k, partialSums, tau1, tau2)
}
return pelt(data, minSegment, cost, penalty)
}
// Partial sums for empirical CDF (formula (2.1) from Section 2.1 "Model" in [Haynes2017])
//
// partialSums'[i, tau] = (count(data[j] < t) * 2 + count(data[j] == t) * 1) for j=0..tau-1
// where t is the i-th quantile value (see Section 3.1 "Discrete approximation" in [Haynes2017] for details)
//
// In order to get better performance, we present
// a two-dimensional array partialSums'[k, n + 1] as a single-dimensional array partialSums[k * (n + 1)].
// We assume that partialSums'[i, tau] = partialSums[i * (n + 1) + tau].
//
// - We use doubled sum values in order to use []int instead of []float64 (it provides noticeable
// performance boost). Thus, multipliers for count(data[j] < t) and count(data[j] == t) are
// 2 and 1 instead of 1 and 0.5 from the [Haynes2017].
// - Note that these quantiles are not uniformly distributed: tails of the data distribution contain more
// quantile values than the center of the distribution
func edPartialSums(data []float64, k int) []int32 {
n := len(data)
partialSums := make([]int32, k*(n+1))
sortedData := append([]float64(nil), data...)
sort.Float64s(sortedData)
offset := 0
for i := 0; i < k; i++ {
z := -1 + (2*float64(i)+1)/float64(k) // Values from (-1+1/k) to (1-1/k) with step = 2/k
p := 1 / (1 + math.Pow(2*float64(n)-1, -z)) // Values from 0.0 to 1.0
t := sortedData[int(p*float64(n-1))] // Quantile value, formula (2.1) in [Haynes2017]
for j, val := range data {
delta := int32(0)
if val < t {
delta = 2 // We use doubled value (2) instead of original 1.0
} else if val == t {
delta = 1 // We use doubled value (1) instead of original 0.5
}
partialSums[offset+j+1] = partialSums[offset+j] + delta
}
offset += n + 1
}
return partialSums
}
func edCost(n int, k int, partialSums []int32, tau1 int, tau2 int) float64 {
tauDiff2i := int32(2 * (tau2 - tau1))
tauDiff2f := float64(2 * (tau2 - tau1))
tauDiff2fLog := math.Log(tauDiff2f)
sum := 0.0
maxOffset := k * (n + 1)
for offset := 0; offset < maxOffset; offset += n + 1 {
// actualSum is (count(data[j] < t) * 2 + count(data[j] == t) * 1) for j=tau1..tau2-1
actualSum := partialSums[offset+tau2] - partialSums[offset+tau1] // partialSums'[i, tau2] - partialSums'[i, tau1]
if actualSum == 0 || actualSum == tauDiff2i {
continue // We skip these two cases (correspond to fit = 0 or fit = 1) because of invalid math.Log values
}
// Empirical CDF F_i(t) (Section 2.1 "Model" in [Haynes2017]):
// fit = actualSum / (2 * (tau2 - tau1))
// Segment cost L_np (Section 2.2 "Nonparametric maximum likelihood" in [Haynes2017])
// lnp = (tau2 - tau1) * (fit*log(fit) + (1-fit)*log(1-fit))
// In the end, we multiply sum by 2 (Section 3.1 "Discrete approximation" in [Haynes2017]):
// return 2 * c / k * sum
//
// Substituting fit into lnp, and transforming log(x/y) into log(x) - log(y),
// we can get rid of extra multiplications and divisions:
actualSumF := float64(actualSum)
actualSumFSub := tauDiff2f - actualSumF
sum += actualSumF * (math.Log(actualSumF) - tauDiff2fLog)
sum += actualSumFSub * (math.Log(actualSumFSub) - tauDiff2fLog)
}
// Constant from Lemma 3.1 in [Haynes2017]
c := -math.Log(2*float64(n) - 1)
return c / float64(k) * sum
}