
Commit 5c87883

Merge pull request #70 from DynamicsAndNeuralSystems/jmoo2880-python-upgrade

Update pyspi dependencies for Python 3.10+ compatibility

2 parents (0a53795 + 526c7eb)

File tree: 13 files changed (+129, -64 lines)


.github/SECURITY.md (1 addition & 1 deletion)

@@ -13,6 +13,6 @@ currently being supported with security updates.

 | Version | Supported          |
 | ------- | ------------------ |
-| 0.4     | :white_check_mark: |
+| 1.1.0   | :white_check_mark: |

.github/workflows/run_unit_tests.yaml (2 additions & 1 deletion)

@@ -8,7 +8,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8", "3.9"]
+        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
     steps:
       - uses: actions/checkout@v4
       - name: Setup python ${{ matrix.python-version }}
@@ -23,6 +23,7 @@ jobs:
       - name: Install pyspi dependencies
         run: |
           python -m pip install --upgrade pip
+          pip install setuptools
           pip install -r requirements.txt
           pip install .
       - name: Run pyspi calculator/utils unit tests
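The explicit `pip install setuptools` step is presumably there for the newly added Python 3.12 runners: 3.12 removed the stdlib `distutils` module and no longer preinstalls setuptools in fresh environments, while setuptools provides a `distutils` compatibility shim. A quick check of that assumption (illustrative, not part of the workflow):

import sys

if sys.version_info >= (3, 12):
    try:
        import distutils  # resolved via setuptools' compatibility shim on 3.12+
    except ImportError:
        raise SystemExit("distutils unavailable: run `pip install setuptools`")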

README.md (1 addition & 17 deletions)

@@ -14,7 +14,7 @@
   <a href="https://www.gnu.org/licenses/gpl-3.0"><img src="https://img.shields.io/badge/License-GPLv3-blue.svg" height="20"/></a>
   <a href="https://github.com/DynamicsAndNeuralSystems/pyspi/actions/workflows/run_unit_tests.yaml"><img src="https://github.com/DynamicsAndNeuralSystems/pyspi/actions/workflows/run_unit_tests.yaml/badge.svg" height="20"/></a>
   <a href="https://twitter.com/compTimeSeries"><img src="https://img.shields.io/twitter/url/https/twitter.com/compTimeSeries.svg?style=social&label=Follow%20%40compTimeSeries" height="20"/></a><br>
-  <a href="https://www.python.org"><img src="https://img.shields.io/badge/Python-3.8%20|%203.9-3776AB.svg?style=flat&logo=python&logoColor=white" alt="Python 3.8 | 3.9"></a>
+  <a href="https://www.python.org"><img src="https://img.shields.io/badge/Python-3.8%20|%203.9%20|%203.10%20|%203.11%20|%203.12-3776AB.svg?style=flat&logo=python&logoColor=white" alt="Python 3.8 | 3.9 | 3.10 | 3.11 | 3.12"></a>
 </p>

 _pyspi_ is a comprehensive python library for computing statistics of pairwise interactions (SPIs) from multivariate time-series (MTS) data.

@@ -74,21 +74,6 @@ Once you have installed _pyspi_, you can learn how to apply the package by check

 ### Advanced Usage
 For advanced users, we offer several additional guides in the [full documentation](https://time-series-features.gitbook.io/pyspi/usage/advanced-usage) on how you can distribute your _pyspi_ jobs across PBS clusters, as well as how you can construct your own subsets of SPIs.
-Click one of the following dropdowns for more information:
-
-<details closed>
-<summary>Distributing pyspi calculations</summary>
-<p>If you have access to a PBS cluster and are processing MTS with many processes (or are analyzing many MTS), then you may find the <a href="https://github.com/DynamicsAndNeuralSystems/pyspi-distribute"><em>pyspi distribute</em></a> repository helpful.
-In the full <a href="https://time-series-features.gitbook.io/pyspi/usage/advanced-usage/distributing-calculations-on-a-cluster">documentation</a>, we provide a comprehensive guide on how you can distribute <em>pyspi</em> calculations on a PBS cluster, along with the necessary scripts and commands to get started!</p>
-</details>
-
-<details closed>
-<summary>Reduced subsets</summary>
-<p>If your dataset is large (containing many processes and/or observations), you can use a pre-configured set of reduced statistics or create your own subsets.
-Follow the guide in the <a href="https://time-series-features.gitbook.io/pyspi/usage/advanced-usage/using-a-reduced-spi-set">documentation</a> to learn how you can create your own reduced subsets.</p>
-</details>
-

 ## SPI Descriptions 📋
 To access a table with a high-level overview of the _pyspi_ library of SPIs, including their associated identifiers, see the [table of SPIs](https://time-series-features.gitbook.io/pyspi/spis/table-of-spis) in the full documentation.

@@ -167,4 +152,3 @@ Below are some of the leading contributors to _pyspi_:

 ## License 🧾
 _pyspi_ is released under the [GNU General Public License](https://www.gnu.org/licenses/gpl-3.0).
-

pyproject.toml (2 additions & 2 deletions)

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "pyspi"
-version = "1.0.3"
+version = "1.1.0"
 authors = [
     { name ="Oliver M. Cliff", email="[email protected]"},
 ]
@@ -15,7 +15,7 @@ maintainers = [
 description = "Library for pairwise analysis of time series data."
 readme = "README.md"
 license = {text = "GNU General Public License v3 (GPLv3)"}
-requires-python = ">=3.8,<3.10"
+requires-python = ">=3.8"
 classifiers = [
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",

pyspi/calculator.py (10 additions & 4 deletions)

@@ -8,7 +8,7 @@

 # From this package
 from .data import Data
-from .utils import convert_mdf_to_ddf, check_optional_deps
+from .utils import convert_mdf_to_ddf, check_optional_deps, inspect_calc_results


 class Calculator:
@@ -34,14 +34,18 @@ class Calculator:
         A pre-configured subset of SPIs to use. Options are "all", "fast", "sonnet", or "fabfour", defaults to "all".
     configfile (str, optional):
         The location of the YAML configuration file for a user-defined subset. See :ref:`Using a reduced SPI set`, defaults to :code:`'</path/to/pyspi>/pyspi/config.yaml'`
+    normalise (bool, optional):
+        Normalise the dataset along the time axis before computing SPIs, defaults to True.
     """

     _optional_dependencies = None

     def __init__(
-        self, dataset=None, name=None, labels=None, subset="all", configfile=None
+        self, dataset=None, name=None, labels=None, subset="all", configfile=None,
+        normalise=True
     ):
         self._spis = {}
         self._excluded_spis = list()
+        self._normalise = normalise

         # Define configfile by subset if it was not specified
         if configfile is None:
@@ -252,7 +256,7 @@ def load_dataset(self, dataset):
             New dataset to attach to calculator.
         """
         if not isinstance(dataset, Data):
-            self._dataset = Data(Data.convert_to_numpy(dataset))
+            self._dataset = Data(Data.convert_to_numpy(dataset), normalise=self._normalise)
         else:
             self._dataset = dataset

@@ -293,7 +297,9 @@ def compute(self):
                 warnings.warn(f'Caught {type(err)} for SPI "{spi}": {err}')
                 self._table[spi] = np.NaN
         pbar.close()
-
+        print(f"\nCalculation complete. Time taken: {pbar.format_dict['elapsed']:.4f}s")
+        inspect_calc_results(self)
+
     def _rmmin(self):
         """Iterate through all spis and remove the minimum (fixes absolute value errors when correlating)"""
         for spi in self.spis:
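A minimal usage sketch of the new keyword (the random dataset and variable names below are illustrative, not part of this commit):

import numpy as np
from pyspi.calculator import Calculator

dataset = np.random.randn(3, 100)   # 3 processes, 100 observations

# Default behaviour is unchanged: the data are normalised before computing SPIs.
calc = Calculator(dataset=dataset)

# New in 1.1.0: skip normalisation, e.g. for data that are already pre-processed.
calc_raw = Calculator(dataset=dataset, normalise=False)
calc_raw.compute()   # now also prints elapsed time and an SPI results summary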

pyspi/data.py (3 additions & 0 deletions)

@@ -177,11 +177,14 @@ def set_data(
             data = data[:, :n_observations]

         if self.normalise:
+            print("Normalising the dataset...\n")
             data = zscore(data, axis=1, nan_policy="omit", ddof=1)
             try:
                 data = detrend(data, axis=1)
             except ValueError as err:
                 print(f"Could not detrend data: {err}")
+        else:
+            print("Skipping normalisation of the dataset...\n")

         nans = np.isnan(data)
         if nans.any():
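For reference, the normalisation this branch applies is a per-process z-score followed by a linear detrend, both along the time axis. A standalone sketch using the same SciPy calls (the toy array is illustrative):

import numpy as np
from scipy.stats import zscore
from scipy.signal import detrend

data = np.random.randn(3, 100)                       # (processes, observations)
z = zscore(data, axis=1, nan_policy="omit", ddof=1)  # zero mean, unit variance per process
z = detrend(z, axis=1)                               # remove any linear trend per process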

pyspi/statistics/causal.py (11 additions & 0 deletions)

@@ -2,6 +2,8 @@
 import pandas as pd
 from cdt.causality.pairwise import ANM, CDS, IGCI, RECI
 import pyEDM
+from sklearn.gaussian_process import GaussianProcessRegressor
+from cdt.causality.pairwise.ANM import normalized_hsic

 from pyspi.base import Directed, Unsigned, Signed, parse_bivariate, parse_multivariate

@@ -11,6 +13,15 @@ class AdditiveNoiseModel(Directed, Unsigned):
     name = "Additive noise model"
     identifier = "anm"
     labels = ["unsigned", "causal", "unordered", "linear", "directed"]
+
+    # monkey-patch the anm_score function, see cdt PR #155
+    def corrected_anm_score(self, x, y):
+        gp = GaussianProcessRegressor(random_state=42).fit(x, y)
+        y_predict = gp.predict(x).reshape(-1, 1)
+        indepscore = normalized_hsic(y_predict - y, x)
+        return indepscore
+
+    ANM.anm_score = corrected_anm_score

     @parse_bivariate
     def bivariate(self, data, i=None, j=None):
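The assignment `ANM.anm_score = corrected_anm_score` rebinds the method on the imported cdt class at import time, so every `ANM` instance created afterwards dispatches to the corrected scorer. A minimal standalone sketch of the same class-level patching pattern (the `Base` class here is hypothetical, not part of cdt):

class Base:
    def score(self, x):
        return x          # original behaviour, to be replaced

def corrected_score(self, x):
    return x + 1          # patched behaviour

Base.score = corrected_score   # rebinding at the class level affects all instances

assert Base().score(1) == 2    # instances now use the patched method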

pyspi/statistics/misc.py (6 additions & 1 deletion)

@@ -1,5 +1,6 @@
 import warnings
 import numpy as np
+import inspect

 from statsmodels.tsa import stattools
 from statsmodels.tsa.vector_ar.vecm import coint_johansen
@@ -115,7 +116,11 @@ def bivariate(self, data, i=None, j=None):
         z = data.to_numpy()
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
-            mdl = self._model().fit(z[i], np.ravel(z[j]))
+            model_params = inspect.signature(self._model).parameters
+            if "random_state" in model_params:
+                mdl = self._model(random_state=42).fit(z[i], np.ravel(z[j]))
+            else:
+                mdl = self._model().fit(z[i], np.ravel(z[j]))
         y_predict = mdl.predict(z[i])
         return mean_squared_error(y_predict, np.ravel(z[j]))
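The `inspect.signature` check lets one code path serve estimators that do and do not accept a seed: the seed is passed only when the constructor advertises it. A standalone sketch of the same pattern (both toy classes are illustrative):

import inspect

class SeededModel:
    def __init__(self, random_state=None):
        self.random_state = random_state

class UnseededModel:
    def __init__(self):
        pass

def build(model_cls):
    # Pass random_state only if the constructor accepts it
    if "random_state" in inspect.signature(model_cls).parameters:
        return model_cls(random_state=42)
    return model_cls()

assert build(SeededModel).random_state == 42
assert isinstance(build(UnseededModel), UnseededModel)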

pyspi/utils.py (44 additions & 0 deletions)

@@ -228,3 +228,47 @@ def filter_spis(keywords, output_name = None, configfile= None):
     - Next Steps: To utilise the filtered set of SPIs, please initialise a new Calculator instance with the following command:
     `Calculator(configfile='{output_file}')`
     """)
+
+def inspect_calc_results(calc):
+    total_num_spis = calc.n_spis
+    num_procs = calc.dataset.n_processes
+    spi_results = dict({'Successful': list(), 'NaNs': list(), 'Partial NaNs': list()})
+    for key in calc.spis.keys():
+        if calc.table[key].isna().all().all():
+            spi_results['NaNs'].append(key)
+        elif calc.table[key].isnull().values.sum() > num_procs:
+            # off-diagonal NaNs
+            spi_results['Partial NaNs'].append(key)
+        else:
+            # returned numeric values (i.e., not NaN)
+            spi_results['Successful'].append(key)
+
+    # print summary
+    double_line_60 = "="*60
+    single_line_60 = "-"*60
+    print("\nSPI Computation Results Summary")
+    print(double_line_60)
+    print(f"\nTotal number of SPIs attempted: {total_num_spis}")
+    print(f"Number of SPIs successfully computed: {len(spi_results['Successful'])} ({len(spi_results['Successful']) / total_num_spis * 100:.2f}%)")
+    print(single_line_60)
+    print("Category       | Count | Percentage")
+    print(single_line_60)
+    for category, spis in spi_results.items():
+        count = len(spis)
+        percentage = (count / total_num_spis) * 100
+        print(f"{category:14} | {count:5} | {percentage:6.2f}%")
+    print(single_line_60)
+
+    if spi_results['NaNs']:
+        print(f"\n[{len(spi_results['NaNs'])}] SPI(s) produced NaN outputs:")
+        print(single_line_60)
+        for i, spi in enumerate(spi_results['NaNs']):
+            print(f"{i+1}. {spi}")
+        print(single_line_60 + "\n")
+    if spi_results['Partial NaNs']:
+        print(f"\n[{len(spi_results['Partial NaNs'])}] SPIs which produced partial NaN outputs:")
+        print(single_line_60)
+        for i, spi in enumerate(spi_results['Partial NaNs']):
+            print(f"{i+1}. {spi}")
+        print(single_line_60 + "\n")
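The classification hinges on the NaN count: the source comment suggests each SPI table carries `n_processes` expected NaNs (the diagonal), so a count above that threshold signals off-diagonal failures. A toy check of the same rule on a 3x3 table (the values are illustrative):

import numpy as np
import pandas as pd

num_procs = 3
table = pd.DataFrame(np.ones((num_procs, num_procs)))
np.fill_diagonal(table.values, np.nan)   # diagonal NaNs are expected
table.iloc[0, 1] = np.nan                # one off-diagonal failure

if table.isna().all().all():
    category = "NaNs"                    # every entry failed
elif table.isnull().values.sum() > num_procs:
    category = "Partial NaNs"            # 4 NaNs > 3 expected diagonal NaNs
else:
    category = "Successful"

print(category)                          # -> Partial NaNs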

requirements.txt (19 additions & 18 deletions)

@@ -1,21 +1,22 @@
 pytest
-scikit-learn==1.0.1
-scipy==1.7.3
-numpy>=1.21.1
-pandas==1.5.0
-statsmodels==0.12.1
-pyyaml==5.4
-tqdm==4.50.2
-nitime==0.9
-hyppo==0.2.1
-pyEDM==1.9.3
-jpype1==1.2.0
-sktime==0.8.0
-dill==0.3.2
-spectral-connectivity==0.2.4.dev0
-torch==1.13.1
+h5py
+scikit-learn
+scipy
+numpy
+pandas
+statsmodels
+pyyaml
+tqdm
+nitime
+hyppo
+pyEDM==1.15.2.0
+jpype1
+sktime
+dill
+spectral-connectivity
+torch
 cdt==0.5.23
-oct2py==5.2.0
-tslearn==0.5.2
+oct2py
+tslearn
 mne==0.23.0
-seaborn==0.11.0
+seaborn
