Skip to content

Commit 5f6389a

Browse files
committed
Update: dof for chi-square test
1 parent 54a6aaf commit 5f6389a

File tree

3 files changed

+13
-5
lines changed

3 files changed

+13
-5
lines changed

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name="parallel-principal-feature-analysis",
8-
version="1.0.3",
8+
version="1.0.4",
99
author="Tim Breitenbach & Lauritz Rasbach",
1010
1111
description="The first package for (parallel) Principal Feature Analysis",

src/parallel_principal_feature_analysis/find_relevant_principal_features.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,8 +102,12 @@ def find_relevant_principal_features(data,number_output_functions,cluster_size,a
102102
counter_bins_less_than5_relevant_principal_features += 1
103103
if sum(expfreq.flatten() < 1) > 0:
104104
counter_bins_less_than1_relevant_principal_features += 1
105-
pv = scipy.stats.chisquare(freq_data_product.flatten(), expfreq.flatten(),ddof=-1)[1]
106-
# ddof=-1 to have the degrees of freedom of the chi square eaual the number of bins, see corresponding paper (Appendix) for details
105+
pv = scipy.stats.chisquare(freq_data_product.flatten(), expfreq.flatten(),ddof=(freq_data_product.shape[0]-1)+(freq_data_product.shape[1]-1))[1]
106+
# According to the documentation of scipy.stats.chisquare, the degrees of freedom are k - 1 - ddof, where ddof=0 by default and k=freq_data_product.shape[0]*freq_data_product.shape[1] is the number of bins of the flattened contingency table.
107+
# According to the literature, the chi-square test statistic for a test of independence (r x m contingency table) is approximately chi-square distributed (under some assumptions) with degrees of freedom equal to
108+
# (freq_data_product.shape[0]-1)*(freq_data_product.shape[1]-1) = freq_data_product.shape[0]*freq_data_product.shape[1] - freq_data_product.shape[0] - freq_data_product.shape[1] + 1.
109+
# Consequently, ddof is set equal to (freq_data_product.shape[0]-1)+(freq_data_product.shape[1]-1) to adjust the degrees of freedom accordingly.
110+
107111
# if the p-value pv is less than or equal to alpha, the hypothesis that j is independent of the output function is rejected
108112
if pv <= alpha:
109113
dependent=1

src/parallel_principal_feature_analysis/principal_feature_analysis.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,12 @@ def principal_feature_analysis(cluster_size,data,number_output_functions,freq_da
6060
counter_bin_less_than5 += 1
6161
if sum(expfreq.flatten() < 1) > 0:
6262
counter_bin_less_than1 += 1
63-
pv = scipy.stats.chisquare(freq_data_product.flatten(), expfreq.flatten(),ddof=-1)[1]
64-
# ddof=-1 to have the degrees of freedom of the chi square eaual the number of bins, see corresponding paper (Appendix) for details
63+
pv = scipy.stats.chisquare(freq_data_product.flatten(), expfreq.flatten(),ddof=(freq_data_product.shape[0]-1)+(freq_data_product.shape[1]-1))[1]
64+
# According to the documentation of scipy.stats.chisquare, the degrees of freedom are k - 1 - ddof, where ddof=0 by default and k=freq_data_product.shape[0]*freq_data_product.shape[1] is the number of bins of the flattened contingency table.
65+
# According to the literature, the chi-square test statistic for a test of independence (r x m contingency table) is approximately chi-square distributed (under some assumptions) with degrees of freedom equal to
66+
# (freq_data_product.shape[0]-1)*(freq_data_product.shape[1]-1) = freq_data_product.shape[0]*freq_data_product.shape[1] - freq_data_product.shape[0] - freq_data_product.shape[1] + 1.
67+
# Consequently, ddof is set equal to (freq_data_product.shape[0]-1)+(freq_data_product.shape[1]-1) to adjust the degrees of freedom accordingly.
68+
6569
# if the p-value pv is less than or equal to alpha, the hypothesis that j is independent of i is rejected
6670
if pv <= alpha:
6771
global_adjm[cluster[i], cluster[j] ] = 1

0 commit comments

Comments
 (0)