deploy: bf13075

beringresearch · Jun 12, 2024 · cd83d35 · cd83d35
commit cd83d35
Show file tree

Hide file tree

Showing 153 changed files with 12,013 additions and 0 deletions.
diff --git a/.buildinfo b/.buildinfo
@@ -0,0 +1,4 @@
+# Sphinx build info version 1
+# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
+config: 673df79abd68348107e6bbf88e2e9e72
+tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/.doctrees/api.doctree b/.doctrees/api.doctree
diff --git a/.doctrees/comparisons.doctree b/.doctrees/comparisons.doctree
diff --git a/.doctrees/embeddings_benchmarks.doctree b/.doctrees/embeddings_benchmarks.doctree
diff --git a/.doctrees/environment.pickle b/.doctrees/environment.pickle
diff --git a/.doctrees/examples.doctree b/.doctrees/examples.doctree
diff --git a/.doctrees/hyperparameters.doctree b/.doctrees/hyperparameters.doctree
diff --git a/.doctrees/index.doctree b/.doctrees/index.doctree
diff --git a/.doctrees/metric_learning.doctree b/.doctrees/metric_learning.doctree
diff --git a/.doctrees/oom_datasets.doctree b/.doctrees/oom_datasets.doctree
diff --git a/.doctrees/python_package.doctree b/.doctrees/python_package.doctree
diff --git a/.doctrees/r_package.doctree b/.doctrees/r_package.doctree
diff --git a/.doctrees/scanpy_singlecell.doctree b/.doctrees/scanpy_singlecell.doctree
diff --git a/.doctrees/semi_supervised.doctree b/.doctrees/semi_supervised.doctree
diff --git a/.doctrees/supervised.doctree b/.doctrees/supervised.doctree
diff --git a/.doctrees/timings_benchmarks.doctree b/.doctrees/timings_benchmarks.doctree
diff --git a/.doctrees/unsupervised.doctree b/.doctrees/unsupervised.doctree
diff --git a/.nojekyll b/.nojekyll
diff --git a/_images/FigureS1.png b/_images/FigureS1.png
diff --git a/_images/SVM-accuracy-classification-weight-zoomed.png b/_images/SVM-accuracy-classification-weight-zoomed.png
diff --git a/_images/SVM-classification-weight-impact-mnist.png b/_images/SVM-classification-weight-impact-mnist.png
diff --git a/_images/accuracy-classification-weight-zoomed.png b/_images/accuracy-classification-weight-zoomed.png
diff --git a/_images/boston_test_regression_mae_pred-true.png b/_images/boston_test_regression_mae_pred-true.png
diff --git a/_images/boston_train_regression_mae_pred-true.png b/_images/boston_train_regression_mae_pred-true.png
diff --git a/_images/classification-weight-impact-mnist.jpg b/_images/classification-weight-impact-mnist.jpg
diff --git a/_images/classification-weight-impact-mnist_mask-0.5.png b/_images/classification-weight-impact-mnist_mask-0.5.png
diff --git a/_images/classification-weight-softmax-confidence-impact-mnist.png b/_images/classification-weight-softmax-confidence-impact-mnist.png
diff --git a/_images/comparisons_ivis_umap_levine_distances.png b/_images/comparisons_ivis_umap_levine_distances.png
diff --git a/_images/comparisons_moons.png b/_images/comparisons_moons.png
diff --git a/_images/comparisons_swiss_roll.png b/_images/comparisons_swiss_roll.png
diff --git a/_images/ivis_aorta_all_markers.png b/_images/ivis_aorta_all_markers.png
diff --git a/_images/ivis_embeddings_benchmarks.png b/_images/ivis_embeddings_benchmarks.png
diff --git a/_images/ivis_k_accuracy.png b/_images/ivis_k_accuracy.png
diff --git a/_images/ivis_k_embeddings.png b/_images/ivis_k_embeddings.png
diff --git a/_images/ivis_patience_boxplots.png b/_images/ivis_patience_boxplots.png
diff --git a/_images/ivis_retinal_bipolar_cells.png b/_images/ivis_retinal_bipolar_cells.png
diff --git a/_images/ivis_timings_100000_1000000.png b/_images/ivis_timings_100000_1000000.png
diff --git a/_images/ivis_timings_1000_5000.png b/_images/ivis_timings_1000_5000.png
diff --git a/_images/ivis_timings_5000_20000.png b/_images/ivis_timings_5000_20000.png
diff --git a/_images/kdd99-ivis-demo.png b/_images/kdd99-ivis-demo.png
diff --git a/_images/mask-vs-supervision_rf-heatmap.png b/_images/mask-vs-supervision_rf-heatmap.png
diff --git a/_images/metric_learning.png b/_images/metric_learning.png
diff --git a/_images/mnist-embedding-comparison_titled.png b/_images/mnist-embedding-comparison_titled.png
diff --git a/_images/mnist-testingset-semisupervised-0.5weight.png b/_images/mnist-testingset-semisupervised-0.5weight.png
diff --git a/_images/output_24_1.png b/_images/output_24_1.png
diff --git a/_images/random_projections_benchmaks.png b/_images/random_projections_benchmaks.png
diff --git a/_images/swiss_roll_model.png b/_images/swiss_roll_model.png
diff --git a/_sources/api.rst.txt b/_sources/api.rst.txt
@@ -0,0 +1,70 @@
+.. _api:
+
+Ivis
+====
+
+.. currentmodule:: ivis
+
+.. autoclass:: Ivis
+  :members:
+  :undoc-members:
+  :inherited-members:
+  :show-inheritance:
+
+Neighbour Retrieval
+===================
+.. autoclass:: ivis.data.neighbour_retrieval.NeighbourMatrix
+  :members:
+  :show-inheritance:
+
+.. autoclass:: ivis.data.neighbour_retrieval.AnnoyKnnMatrix
+  :members:
+  :show-inheritance:
+  :special-members:
+  :exclude-members: __weakref__
+
+.. autoclass:: ivis.data.neighbour_retrieval.LabeledNeighbourMap
+  :members:
+  :show-inheritance:
+  :special-members:
+  :exclude-members: __weakref__
+
+.. autofunction:: ivis.data.neighbour_retrieval.knn.build_annoy_index
+
+Indexable Datasets
+==================
+.. autoclass:: ivis.data.sequence.IndexableDataset
+  :members:
+  :show-inheritance:
+  :special-members:
+  :exclude-members: __weakref__
+
+.. autoclass:: ivis.data.sequence.ImageDataset
+  :members:
+  :show-inheritance:
+  :special-members:
+  :exclude-members: __weakref__
+
+.. autoclass:: ivis.data.sequence.FlattenedImageDataset
+  :members:
+  :show-inheritance:
+  :special-members:
+  :exclude-members: __weakref__
+
+
+Losses
+======
+
+.. automodule:: ivis.nn.losses
+  :members:
+
+Callbacks
+=========
+
+.. automodule:: ivis.nn.callbacks
+  :members:
+  :show-inheritance:
+
+.. raw:: html
+
+    <video controls loop="true" autoplay="autoplay" width="560" height="315" src="_static/tensorboard_embeddings_plots.mp4"></video>
diff --git a/_sources/comparisons.rst.txt b/_sources/comparisons.rst.txt
@@ -0,0 +1,86 @@
+.. _comparisons:
+
+
+Comparing ivis with other dimensionality reduction algorithms
+=============================================================
+
+Ivis aims to reduce data dimensionality whilst preserving both global
+and local structures. There are a number of real-world applications
+where this feature could be useful. For example:
+
+-  Anomaly detection
+-  Biological interpretation of high-throughput experiments
+-  Feature extraction
+
+Several algorithms have been proposed to address the problem of
+dimensionality reduction, including
+`UMAP <https://umap-learn.readthedocs.io/en/latest/>`__ and
+`t-SNE <https://lvdmaaten.github.io/tsne/>`__. UMAP, in particular, has
+been successfully applied in machine learning pipelines. Ivis is
+different to these approaches in several ways.
+
+First, ``ivis`` does not make any assumptions as to the inherent structure
+of the dataset. Second, ``ivis`` is designed
+to handle both small and extremely large datasets. Ivis performs well on
+toy datasets such as the *iris* dataset, and scales linearly to datasets
+with millions of observations. Indeed, we see that the main use case for ``ivis`` is datasets with > 250,000 observations. Finally, ``ivis`` prioritises interpretation
+over visual appearance - this is accomplished by imposing meaning to
+distances between points in the embedding space. As such, ``ivis`` does not
+create spurious clusters nor does it artificially pack clusters closer
+together. Embeddings aim to be true to the original structure of the
+data, which can be noisy in a real-world setting.
+
+
+
+
+Visual Assessment
+------------------
+
+We will visually examine how popular dimensionality reduction algorithms -
+UMAP, t-SNE, Isomap, MDS, and PCA - approach two synthetic datasets
+with 5,000 observations in each. Since we are concerned with a
+dimensionality reduction problem, we will artificially add redundant
+features to the original datasets using polynomial combinations (degree
+≤ 10) of the original features.
+
+Random Noise
+~~~~~~~~~~~~
+
+To start, let's examine how various dimensionality reduction methods behave in the presence of random noise. We generated 5000 uniformly distributed random points in a two-dimensional space and expanded the feature set using polynomial combinations. In all cases default parameters were used to fit multiple models.
+
+.. image:: _static/random_projections_benchmaks.png
+
+Both ``ivis`` and PCA reliably recovered the random nature of our dataset. Conversely, Isomap, UMAP, and t-SNE appeared to pack certain points together, creating an impression of clusters within uniform random noise.
+
+Structured Datasets
+~~~~~~~~~~~~~~~~~~~
+
+Next, we examine how well global features of a dataset, such as relative position of clusters, can be recovered in a low-dimensional space.
+
+.. image:: _static/comparisons_moons.png
+
+
+Using default parameters, we can see that ``ivis`` captures both
+the general structure of each half-moon, as well as their relative
+positions to one another. Both UMAP and t-SNE appear to introduce
+spurious clusters and global relationships between the half-moons appear
+to be disrupted.
+
+
+.. image:: _static/comparisons_swiss_roll.png 
+
+
+As above, UMAP and t-SNE appear to generate a large number of small clusters along
+the continuous distribution of the dataset, although the global
+structure is relatively well-preserved. ``ivis`` maintains both global and
+local structures of the dataset.
+
+Quantitative Evaluation
+-----------------------
+
+To measure how well each algorithm preserves global distances, we examined correlation between points in the original dataset and the embedding space. For this analysis, 10,000 observations were chosen from the `Levine dataset <https://github.com/lmweber/benchmark-data-Levine-32-dim>`__ (104,184 x 32) using random uniform sampling. Box plots represent distances across pairs of points in the embeddings, binned using 50 equal-width bins over the pairwise distances in the original
+space. Pearson correlation coefficients were also computed over the pairs of distances.
+
+.. image:: _static/comparisons_ivis_umap_levine_distances.png
+
+``ivis`` appeared to preserve small-, mid-, and large-scale L1 and L2 distances, whilst UMAP and t-SNE seemed to ignore mid- to large-scale distances. Interestingly, ``ivis`` was particularly good at preserving L2 distances in low-dimensional space.
diff --git a/_sources/embeddings_benchmarks.rst.txt b/_sources/embeddings_benchmarks.rst.txt
@@ -0,0 +1,54 @@
+.. _embeddings_benchmarks:
+
+Distance Preservation Benchmarks
+=================================
+
+Dimensionality reduction is crucial for effective manipulation of high-dimensional datasets. However, low-dimensional representations often fail to capture complex global and local relationships in many real-world datasets. Here, we assess how well ``ivis`` preserves inter-cluster distances in two well-characterised datasets and benchmark performance across several linear and non-linear dimensionality reduction approaches.
+
+
+Datasets Selection
+------------------
+
+Two benchmark datasets were used - MNIST database of handwritten digits (70,000 observations, 784 features) and Levine dataset (104,184 observations, 32 features). The Levine dataset was obtained from `Data-Driven Phenotypic Dissection of AML Reveals Progenitor-like Cells that Correlate with Prognosis <http://www.sciencedirect.com/science/article/pii/S0092867415006376>`_. The 32-dimensional Levine dataset can be `downloaded directly from Cytobank <https://www.cytobank.org/cytobank/experiments/46102>`_.
+
+Both datasets have target ``Y`` variables. For MNIST, targets take on values [0, 9] and represent hand-written digits, whilst in the Levine dataset targets are manually annotated cell populations [0-13]. Prior to preprocessing, values in both datasets were scaled to [0, 1] range.
+
+* MNIST preprocessing:
+
+.. code-block:: python
+
+    from sklearn.datasets import fetch_openml
+    from sklearn.preprocessing import MinMaxScaler
+    X, Y = fetch_openml('mnist_784', version=1, return_X_y=True)
+    X = MinMaxScaler().fit_transform(X)
+
+* Levine preprocessing:
+
+.. code-block:: python
+
+    import numpy as np
+    import pandas as pd
+    from sklearn.preprocessing import LabelEncoder, MinMaxScaler
+
+    data = pd.read_csv('../data/levine_32dm_notransform.txt')
+    data = data.dropna()
+
+    features = ['CD45RA', 'CD133', 'CD19', 'CD22', 'CD11b', 'CD4', 'CD8',
+                'CD34', 'Flt3', 'CD20', 'CXCR4', 'CD235ab', 'CD45', 'CD123',
+                'CD321', 'CD14', 'CD33', 'CD47', 'CD11c', 'CD7', 'CD15', 'CD16',
+                'CD44', 'CD38', 'CD13', 'CD3', 'CD61', 'CD117', 'CD49d',
+                'HLA-DR', 'CD64', 'CD41', 'label']
+    data = data[features]
+
+    X = data.drop(['label'], axis=1).values
+    X = np.arcsinh(X/5)
+    X = MinMaxScaler().fit_transform(X) 
+
+Accuracy of Low-Dimensional Embeddings
+--------------------------------------
+
+To establish how well ``ivis`` and other dimensionality reduction techniques preserve data structure in low-dimensional space, a Euclidean distance matrix between centroids of the target values in Levine and MNIST datasets was created for the original datasets, respective ``ivis`` embeddings, as well as UMAP, t-SNE, MDS, and Isomap embeddings. The level of correlation between the original distance matrix and the distance matrices in the embedding spaces was then assessed using the `Mantel test <https://en.wikipedia.org/wiki/Mantel_test>`_. Pearson’s product-moment correlation coefficient (PCC) was used to quantitate concordance between original data and low-dimensional representations. Random stratified subsamples (n=50) of 1000 observations were used to generate a continuum of PCC values for each embedding technique. For all ``ivis`` runs, only two hyperparameters were set: ``k=15`` and ``model="maaten"``. These are recommended defaults for datasets with <500,000 observations. For other dimensionality reduction methods, default parameters were used.
+
+
+.. image:: _static/ivis_embeddings_benchmarks.png
+
+The Mantel Test measures correlation between two distance matrices - embedding space and original space Euclidean distances of cluster centroids. From our experiment, we can conclude that ``ivis`` preserves inter-cluster distances well, with average PCC being ~0.75 in the MNIST and Levine datasets. Importantly, ``ivis`` outperforms other dimensionality reduction techniques.
diff --git a/_sources/examples.rst.txt b/_sources/examples.rst.txt
@@ -0,0 +1,42 @@
+.. _examples:
+
+Examples
+========
+
+You can find here a list of notebooks demonstrating key Ivis functionalities. Also, we would like to list here interesting content created by the community. If you wrote some notebook(s) leveraging Ivis and would like to be listed here, please open a Pull Request so it can be included under the Community notebooks.
+
+.. list-table::
+   :widths: 30 60 10
+   :header-rows: 1
+
+   * - Notebook
+     - Description
+     - Colab
+
+   * - `How to reduce dimensionality of structured data <https://github.com/beringresearch/ivis/blob/master/notebooks/introduction_to_dimensionality_reduction_using_ivis.ipynb>`_
+     - Introduction to Ivis - reduce dimensionality of structured data
+     - |ivis_introduction_colab|_
+
+   * - `How to reduce dimensionality of image data <https://github.com/beringresearch/ivis/blob/master/notebooks/ivis_cnn_backbone_fashion_mnist.ipynb>`_
+     - Reduce dimensionality of image data using Ivis algorithm and a custom convolutional neural network
+     - |ivis_cnn_backbone_fashion_mnist_colab|_
+
+   * - `Using callbacks to assess model training <https://github.com/beringresearch/ivis/blob/master/notebooks/using_callbacks_with_ivis.ipynb>`_
+     - Apply callbacks during ivis training to log and assess intermediate model states.
+     - |using_callbacks_with_ivis_colab|_
+
+   * - `Concept drift detection on image data <https://github.com/beringresearch/ivis/blob/master/notebooks/ivis_concept_drift_detection.ipynb>`_
+     - Detect Concept Drift in image datasets using Ivis.
+     - |ivis_concept_drift_detetion_colab|_
+
+.. |ivis_introduction_colab| image:: https://colab.research.google.com/assets/colab-badge.svg
+.. _ivis_introduction_colab: https://colab.research.google.com/github/beringresearch/ivis/blob/master/notebooks/introduction_to_dimensionality_reduction_using_ivis.ipynb
+
+.. |ivis_cnn_backbone_fashion_mnist_colab| image:: https://colab.research.google.com/assets/colab-badge.svg
+.. _ivis_cnn_backbone_fashion_mnist_colab: https://colab.research.google.com/github/beringresearch/ivis/blob/master/notebooks/ivis_cnn_backbone_fashion_mnist.ipynb
+
+.. |using_callbacks_with_ivis_colab| image:: https://colab.research.google.com/assets/colab-badge.svg
+.. _using_callbacks_with_ivis_colab: https://colab.research.google.com/github/beringresearch/ivis/blob/master/notebooks/using_callbacks_with_ivis.ipynb
+
+.. |ivis_concept_drift_detetion_colab| image:: https://colab.research.google.com/assets/colab-badge.svg
+.. _ivis_concept_drift_detetion_colab: https://colab.research.google.com/github/beringresearch/ivis/blob/master/notebooks/ivis_concept_drift_detection.ipynb