Update: dof for chi-square test

LauritzR · LauritzR · commit 5f6389a882f7 · 2023-04-07T14:51:22.000+02:00
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="parallel-principal-feature-analysis",
-    version="1.0.3",
+    version="1.0.4",
     author="Tim Breitenbach & Lauritz Rasbach",
     author_email="tim.breitenbach@mathematik.uni-wuerzburg.de, rasbachlauritz@googlemail.com",
     description="The first package for (parallel) Principal Feature Analysis",
diff --git a/src/parallel_principal_feature_analysis/find_relevant_principal_features.py b/src/parallel_principal_feature_analysis/find_relevant_principal_features.py
@@ -102,8 +102,12 @@ def find_relevant_principal_features(data,number_output_functions,cluster_size,a
                         counter_bins_less_than5_relevant_principal_features += 1
                     if sum(expfreq.flatten() < 1) > 0:
                         counter_bins_less_than1_relevant_principal_features += 1
-                    pv = scipy.stats.chisquare(freq_data_product.flatten(), expfreq.flatten(),ddof=-1)[1]
-                    # ddof=-1 to have the degrees of freedom of the chi square eaual the number of bins, see corresponding paper (Appendix) for details
+                    pv = scipy.stats.chisquare(freq_data_product.flatten(), expfreq.flatten(),ddof=(freq_data_product.shape[0]-1)+(freq_data_product.shape[1]-1))[1]
+                    # According to the documentation of scipy.stats.chisquare, the degrees of freedom are k - 1 - ddof, where ddof=0 by default and k = freq_data_product.shape[0]*freq_data_product.shape[1].
+                    # According to the literature, the chi-square test statistic for a test of independence (r x m contingency table) is approximately chi-square distributed (under some assumptions) with degrees of freedom equal to
+                    # (freq_data_product.shape[0]-1)*(freq_data_product.shape[1]-1) = freq_data_product.shape[0]*freq_data_product.shape[1] - freq_data_product.shape[0] - freq_data_product.shape[1] + 1.
+                    # Consequently, ddof is set equal to (freq_data_product.shape[0]-1)+(freq_data_product.shape[1]-1) to adjust the degrees of freedom accordingly.
+
                     # if p-value pv is less than alpha the hypothesis that j is independent of the output function is rejected
                     if pv <= alpha:
                         dependent=1
diff --git a/src/parallel_principal_feature_analysis/principal_feature_analysis.py b/src/parallel_principal_feature_analysis/principal_feature_analysis.py
@@ -60,8 +60,12 @@ def principal_feature_analysis(cluster_size,data,number_output_functions,freq_da
                                 counter_bin_less_than5 += 1
                             if sum(expfreq.flatten() < 1) > 0:
                                 counter_bin_less_than1 += 1
-                            pv = scipy.stats.chisquare(freq_data_product.flatten(), expfreq.flatten(),ddof=-1)[1]
-                            # ddof=-1 to have the degrees of freedom of the chi square eaual the number of bins, see corresponding paper (Appendix) for details
+                            pv = scipy.stats.chisquare(freq_data_product.flatten(), expfreq.flatten(),ddof=(freq_data_product.shape[0]-1)+(freq_data_product.shape[1]-1))[1]
+                            # According to the documentation of scipy.stats.chisquare, the degrees of freedom are k - 1 - ddof, where ddof=0 by default and k = freq_data_product.shape[0]*freq_data_product.shape[1].
+                            # According to the literature, the chi-square test statistic for a test of independence (r x m contingency table) is approximately chi-square distributed (under some assumptions) with degrees of freedom equal to
+                            # (freq_data_product.shape[0]-1)*(freq_data_product.shape[1]-1) = freq_data_product.shape[0]*freq_data_product.shape[1] - freq_data_product.shape[0] - freq_data_product.shape[1] + 1.
+                            # Consequently, ddof is set equal to (freq_data_product.shape[0]-1)+(freq_data_product.shape[1]-1) to adjust the degrees of freedom accordingly.
+
                             # if p-value pv is less than alpha the hypothesis that j is independent of i is rejected
                             if pv <= alpha:
                                 global_adjm[cluster[i], cluster[j] ] = 1