
Commit 731cb58

Merge pull request #22 from fidelity/feature/kl_divergence
KL Divergence Based Feature Selection
2 parents 7e07abc + b704d69

File tree

11 files changed: +200 -17 lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 2 deletions

@@ -16,15 +16,14 @@ jobs:
     strategy:
       matrix:
         python-version: ["3.8", "3.9", "3.10"]
-        os: [ubuntu-latest, macos-latest, windows-latest]
+        os: [ubuntu-latest, windows-latest]
       fail-fast: false
     steps:
       - uses: actions/checkout@v2
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v2
         with:
           python-version: ${{ matrix.python-version }}
-
       - name: Check
         shell: bash
         run: |

CHANGELOG.txt

Lines changed: 6 additions & 0 deletions

@@ -2,6 +2,12 @@
 CHANGELOG
 =========

+-------------------------------------------------------------------------------
+August 7, 2025 1.2.0
+-------------------------------------------------------------------------------
+
+- Added KL Divergence based feature selection for binary labels. Thanks to @zohairshafi for contributing this method.
+
 -------------------------------------------------------------------------------
 April, 24, 2023 1.1.2
 -------------------------------------------------------------------------------
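For readers skimming the changelog, the new method plugs into the existing Statistical selection API. A minimal sketch (assuming the `Selective`/`SelectionMethod` imports shown in the repository README, not part of this changelog entry):

    # Minimal sketch: enable the new method through the existing Statistical API.
    # Note: "kl_divergence" only supports binary (two-class) labels.
    from feature.selector import Selective, SelectionMethod

    selector = Selective(SelectionMethod.Statistical(num_features=5, method="kl_divergence"))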

README.md

Lines changed: 3 additions & 2 deletions

@@ -48,7 +48,7 @@ print("Scores:", list(selector.get_absolute_scores()))
 |:------:|:------:|
 | [Variance per Feature](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html) | `threshold` |
 | [Correlation pairwise Features](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.corr.html) | [Pearson Correlation Coefficient](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient) <br> [Kendall Rank Correlation Coefficient](https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient) <br> [Spearman's Rank Correlation Coefficient](https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient) <br> |
-| [Statistical Analysis](https://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection) | [ANOVA F-test Classification](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_classif.html) <br> [F-value Regression](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html) <br> [Chi-Square](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html) <br> [Mutual Information Classification](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html) <br> [Variance Inflation Factor](https://www.statsmodels.org/stable/generated/statsmodels.stats.outliers_influence.variance_inflation_factor.html) |
+| [Statistical Analysis](https://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection) | [ANOVA F-test Classification](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_classif.html) <br> [F-value Regression](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html) <br> [Chi-Square](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html) <br> [KL Divergence](https://en.wikipedia.org/wiki/Kullback–Leibler_divergence) <br> [Mutual Information Classification](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html) <br> [Variance Inflation Factor](https://www.statsmodels.org/stable/generated/statsmodels.stats.outliers_influence.variance_inflation_factor.html) |
 | [Linear Methods](https://en.wikipedia.org/wiki/Linear_regression) | [Linear Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html?highlight=linear%20regression#sklearn.linear_model.LinearRegression) <br> [Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html?highlight=logistic%20regression#sklearn.linear_model.LogisticRegression) <br> [Lasso Regularization](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn.linear_model.Lasso) <br> [Ridge Regularization](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge) <br> |
 | [Tree-based Methods](https://scikit-learn.org/stable/modules/tree.html) | [Decision Tree](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier) <br> [Random Forest](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html?highlight=random%20forest#sklearn.ensemble.RandomForestClassifier) <br> [Extra Trees Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html) <br> [XGBoost](https://xgboost.readthedocs.io/en/latest/) <br> [LightGBM](https://lightgbm.readthedocs.io/en/latest/) <br> [AdaBoost](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html) <br> [CatBoost](https://github.com/catboost)<br> [Gradient Boosting Tree](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html) <br> |
 | [Text-based Methods](https://link.springer.com/chapter/10.1007/978-3-030-78230-6_27) | `featurization_method` = [TextWiser](https://github.com/fidelity/textwiser) <br> `optimization_method = ["exact", "greedy", "kmeans", "random"]` <br> `cost_metric = ["unicost", "diverse"]` |

@@ -81,6 +81,7 @@ selectors = {
     # Statistical methods
     "stat_anova": SelectionMethod.Statistical(num_features, method="anova"),
     "stat_chi_square": SelectionMethod.Statistical(num_features, method="chi_square"),
+    "stat_kl_divergence": SelectionMethod.Statistical(num_features, method="kl_divergence"),
     "stat_mutual_info": SelectionMethod.Statistical(num_features, method="mutual_info"),

     # Linear methods

@@ -168,7 +169,7 @@ plot_importance(df)

 ## Installation

-Selective requires **Python 3.7+** and can be installed from PyPI using ``pip install selective``.
+Selective requires **Python 3.8+** and can be installed from PyPI using ``pip install selective``.

 ## Source
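For context, a hedged end-to-end sketch in the spirit of the README quick-start; the synthetic data, column names, and feature count below are illustrative and not part of the commit:

    import numpy as np
    import pandas as pd

    from feature.selector import Selective, SelectionMethod

    # Illustrative binary-label data (kl_divergence requires exactly two label values)
    rng = np.random.default_rng(0)
    data = pd.DataFrame(rng.normal(size=(100, 4)), columns=["a", "b", "c", "d"])
    label = pd.Series(rng.integers(0, 2, size=100))

    # Select the top-2 features by bidirectional KL divergence between per-class distributions
    selector = Selective(SelectionMethod.Statistical(num_features=2, method="kl_divergence"))
    subset = selector.fit_transform(data, label)

    print("Selected:", list(subset.columns))
    print("Scores:", list(selector.get_absolute_scores()))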

feature/_version.py

Lines changed: 1 addition & 1 deletion

@@ -2,4 +2,4 @@
 # Copyright FMR LLC <[email protected]>
 # SPDX-License-Identifier: Apache-2.0

-__version__ = "1.1.2"
+__version__ = "1.2.0"

feature/kl_divergence.py

Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+# Copyright FMR LLC <[email protected]>
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import NoReturn, Tuple
+
+import pandas as pd
+import numpy as np
+
+from scipy.special import rel_entr
+from feature.base import _BaseSupervisedSelector
+from feature.utils import Num, check_true
+
+
+class _KL_Divergence(_BaseSupervisedSelector):
+
+    def __init__(self, seed: int, num_features: Num, num_bins: Num = 100):
+        super().__init__(seed)
+
+        self.num_features = num_features  # this could be int or float
+        self.num_bins = num_bins
+
+    def fit(self, X: pd.DataFrame, y: pd.Series) -> NoReturn:
+
+        label_categories = np.unique(y)
+        check_true(len(label_categories) == 2, TypeError("Only binary labels are supported for KL Divergence"))
+        input_dimension = X.shape[1]
+
+        kl_mat = np.zeros((input_dimension, 1))
+        X = X.values
+
+        class_one_idx = np.where(y == label_categories[0])[0]
+        class_two_idx = np.where(y == label_categories[1])[0]
+
+        for i in range(input_dimension):
+
+            # Create two distributions, one for the positive label and one for the negative label
+            f1 = np.histogram(X[class_one_idx, i], bins = self.num_bins)[0]
+            f2 = np.histogram(X[class_two_idx, i], bins = self.num_bins)[0]
+
+            # Normalize the histogram counts so each distribution sums to 1
+            f1 = f1 / np.sum(f1)
+            f2 = f2 / np.sum(f2)
+
+            # KL Divergence is not symmetric, so we calculate divergence in both directions
+            kl = rel_entr(f1, f2)
+            kl_reversed = rel_entr(f2, f1)
+
+            # The relative entropy function returns KL(P || Q) = np.inf when P != 0 and Q == 0.
+            kl[kl == np.inf] = 9999
+            kl_reversed[kl_reversed == np.inf] = 9999
+
+            # The final score is the combination of KL divergence in both directions.
+            # This could possibly be a flag in a future version to determine which direction to apply KL Divergence
+            # in if bidirectional is not desired.
+            kl_mat[i] = np.sum(kl) + np.sum(kl_reversed)
+
+        scores_ = kl_mat.flatten()
+
+        self.scores_ = scores_  # This is used by the statistical.py fit function.
+        self.abs_scores = scores_
+
+    def transform(self, data: pd.DataFrame) -> pd.DataFrame:
+
+        # Select top-k from data based on abs_scores and num_features
+        return self.get_top_k(data, self.abs_scores)
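To make the score above concrete, here is a self-contained sketch (numpy/scipy only, independent of the class in this file) that reproduces the per-feature computation for a single feature: histogram each class, normalize to probabilities, and sum rel_entr in both directions. The sample data is illustrative.

    import numpy as np
    from scipy.special import rel_entr

    rng = np.random.default_rng(0)
    x_class0 = rng.normal(loc=0.0, scale=1.0, size=500)  # feature values where label == 0
    x_class1 = rng.normal(loc=1.0, scale=1.0, size=500)  # feature values where label == 1

    # Histogram each class separately and normalize counts into probabilities
    f1 = np.histogram(x_class0, bins=100)[0].astype(float)
    f2 = np.histogram(x_class1, bins=100)[0].astype(float)
    f1 /= f1.sum()
    f2 /= f2.sum()

    # Bidirectional KL: rel_entr returns inf in bins where the first distribution is
    # positive but the second is zero, which the selector above caps at a large constant
    kl = rel_entr(f1, f2)
    kl_rev = rel_entr(f2, f1)
    kl[np.isinf(kl)] = 9999
    kl_rev[np.isinf(kl_rev)] = 9999

    score = kl.sum() + kl_rev.sum()
    print(score)  # larger score => the two class-conditional distributions differ more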

feature/selector.py

Lines changed: 9 additions & 2 deletions

@@ -36,7 +36,7 @@


 __author__ = "FMR LLC"
-__version__ = "1.0.0"
+__version__ = "1.2.0"
 __copyright__ = "Copyright (C), FMR LLC"



@@ -202,6 +202,13 @@ class Statistical(NamedTuple):
         searching for the optimal binning strategy.
         Note: MIC is dropped from Selective due to inactive MINE library

+        The KL Divergence feature importance should only be used with
+        binary labels. It computes the distribution of a given feature for instances where label == 1 and label == 0.
+        Uses KL divergence between the two distributions as an importance score,
+        where a higher value indicates greater discriminative power of the feature
+        with respect to the binary label. Since KL Divergence is non-symmetric, this method
+        computes the divergence in both directions and sums them up.
+
         Notes on Randomness:
         - Mutual Info is non-deterministic, depends on the seed value.
         - The other methods are deterministic

@@ -227,7 +234,7 @@ def _validate(self):
            if isinstance(self.num_features, float):
                check_true(self.num_features <= 1, ValueError("Num features ratio must be between [0..1]."))
            # "maximal_info" dropped
-            check_true(self.method in ["anova", "chi_square", "mutual_info", "variance_inflation"],
+            check_true(self.method in ["anova", "chi_square", "kl_divergence", "mutual_info", "variance_inflation"],
                       ValueError("Statistical method can only be anova, chi_square, or mutual_info."))

    class TreeBased(NamedTuple):
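Because the docstring above stresses the binary-label restriction, here is a short hedged sketch of what a caller would see when it is violated (assuming `check_true` raises the supplied exception when its condition is false, as in feature/utils, and that the multi-class data below reaches the guard in feature/kl_divergence.py):

    import numpy as np
    import pandas as pd

    from feature.selector import Selective, SelectionMethod

    selector = Selective(SelectionMethod.Statistical(num_features=2, method="kl_divergence"))

    data = pd.DataFrame(np.random.randn(60, 3), columns=["x1", "x2", "x3"])
    labels = pd.Series([0, 1, 2] * 20)  # three classes -> not supported

    try:
        selector.fit(data, labels)
    except TypeError as error:
        print(error)  # Only binary labels are supported for KL Divergence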

feature/statistical.py

Lines changed: 16 additions & 7 deletions

@@ -10,9 +10,11 @@
 import pandas as pd
 from sklearn.feature_selection import chi2, f_classif, f_regression, mutual_info_classif, mutual_info_regression
 from statsmodels.stats.outliers_influence import variance_inflation_factor
+from scipy.special import rel_entr

 from feature.base import _BaseSupervisedSelector, _BaseDispatcher
 from feature.utils import get_selector, Num, get_task_string
+from feature.kl_divergence import _KL_Divergence


 class _Statistical(_BaseSupervisedSelector, _BaseDispatcher):

@@ -33,15 +35,17 @@ def __init__(self, seed: int, num_features: Num, method: str):
        self.imp = None

        # Implementor factory
-        self.factory = {"regression_anova": f_regression,
-                        "regression_chi_square": None,
-                        "regression_mutual_info": partial(mutual_info_regression, random_state=self.seed),
-                        # "regression_maximal_info": MINE(), # dropped
-                        "classification_anova": f_classif,
+        self.factory = {"classification_anova": f_classif,
                        "classification_chi_square": chi2,
                        "classification_mutual_info": partial(mutual_info_classif, random_state=self.seed),
                        # "classification_maximal_info": MINE(), # dropped
-                        "unsupervised_variance_inflation": variance_inflation_factor}
+                        "kl_divergence" : _KL_Divergence(num_features = self.num_features, seed = self.seed),
+                        "regression_anova": f_regression,
+                        "regression_chi_square": None,
+                        "regression_mutual_info": partial(mutual_info_regression, random_state=self.seed),
+                        # "regression_maximal_info": MINE(), # dropped
+                        "unsupervised_variance_inflation": variance_inflation_factor,
+                        }

    def get_model_args(self, selection_method) -> Tuple:


@@ -54,14 +58,18 @@ def dispatch_model(self, labels: pd.Series, *args):
        method = args[0]

        # Get statistical scoring function
-        if method == "variance_inflation":
+        if method == "kl_divergence":
+            score_func = self.factory.get(method)
+        elif method == "variance_inflation":
            score_func = self.factory.get("unsupervised_" + method)
        else:
            score_func = self.factory.get(get_task_string(labels) + method)

        # Check scoring compatibility with task
        if score_func is None:
            raise TypeError(method + " cannot be used for task: " + get_task_string(labels))
+        elif method == "kl_divergence":
+            self.imp = score_func
        elif method == "variance_inflation":  # or isinstance(score_func, MINE) (dropped)
            self.imp = score_func
        else:

@@ -82,6 +90,7 @@ def fit(self, data: pd.DataFrame, labels: pd.Series) -> NoReturn:
        if self.method == "variance_inflation":
            # VIF is unsupervised, regression between data and each feature
            self.abs_scores = np.array([variance_inflation_factor(data.values, i) for i in range(data.shape[1])])
+
        else:
            # sklearn selector model
            self.imp.fit(X=data, y=labels)
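The dispatch above resolves most methods through a key built as get_task_string(labels) + method (e.g. "classification_anova"), prefixes VIF with "unsupervised_", and looks up "kl_divergence" by its bare name because its implementor is a selector object rather than a task-specific sklearn scoring function. A standalone toy sketch of that key-resolution logic; the values are placeholders, not the library's actual callables:

    # Toy illustration of the factory lookup above (stand-in values only).
    def resolve(factory, method, task):
        if method == "kl_divergence":
            return factory.get(method)               # bare key, task-independent
        if method == "variance_inflation":
            return factory.get("unsupervised_" + method)
        return factory.get(task + method)            # e.g. "classification_" + "anova"

    toy_factory = {
        "classification_anova": "f_classif",
        "classification_chi_square": "chi2",
        "regression_chi_square": None,
        "kl_divergence": "_KL_Divergence(...)",
        "unsupervised_variance_inflation": "variance_inflation_factor",
    }

    print(resolve(toy_factory, "kl_divergence", task="classification_"))  # _KL_Divergence(...)
    print(resolve(toy_factory, "anova", task="classification_"))          # f_classif
    print(resolve(toy_factory, "chi_square", task="regression_"))         # None -> dispatch_model raises TypeError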

setup.py

Lines changed: 2 additions & 2 deletions

@@ -26,10 +26,10 @@
     packages=setuptools.find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
     classifiers=[
         "License :: OSI Approved :: Apache Software License",
-        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
         "Operating System :: OS Independent",
     ],
     project_urls={"Source": "https://github.com/fidelity/selective"},
     install_requires=required,
-    python_requires=">=3.7"
+    python_requires=">=3.8"
 )

tests/run_all.py

Lines changed: 0 additions & 1 deletion

@@ -5,7 +5,6 @@

 import unittest

-
 # Test Directory
 start_dir = '.'


tests/test_benchmark.py

Lines changed: 1 addition & 0 deletions

@@ -29,6 +29,7 @@ class TestBenchmark(BaseTest):
        "univ_anova": SelectionMethod.Statistical(num_features, method="anova"),
        "univ_chi_square": SelectionMethod.Statistical(num_features, method="chi_square"),
        "univ_mutual_info": SelectionMethod.Statistical(num_features, method="mutual_info"),
+        "kl_divergence": SelectionMethod.Statistical(num_features, method="kl_divergence"),
        "linear": SelectionMethod.Linear(num_features, regularization="none"),
        "lasso": SelectionMethod.Linear(num_features, regularization="lasso", alpha=alpha),
        "ridge": SelectionMethod.Linear(num_features, regularization="ridge", alpha=alpha),
