Skip to content

Commit 8790e85

Browse files
Merge pull request #55 from dillondaudert/v0.2-dev
V0.2 dev
2 parents b738d5e + 0b8f9b6 commit 8790e85

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+4656
-954
lines changed

.github/workflows/CI.yml

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,8 @@ name: CI
22
on:
33
pull_request:
44
push:
5-
branches:
6-
- master
7-
tags: '*'
5+
branches: [main, master]
6+
tags: ["*"]
87
jobs:
98
test:
109
name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
@@ -13,34 +12,37 @@ jobs:
1312
fail-fast: false
1413
matrix:
1514
version:
16-
- '1.6'
1715
- '1'
18-
# - 'nightly'
1916
os:
2017
- ubuntu-latest
21-
- macOS-latest
2218
- windows-latest
2319
arch:
2420
- x64
21+
include:
22+
- os: macOS-latest
23+
arch: x64
24+
version: '1'
2525
steps:
26-
- uses: actions/checkout@v2
27-
- uses: julia-actions/setup-julia@v1
26+
- uses: actions/checkout@v4
27+
- uses: julia-actions/setup-julia@v2
2828
with:
2929
version: ${{ matrix.version }}
3030
arch: ${{ matrix.arch }}
31-
- uses: actions/cache@v1
32-
env:
33-
cache-name: cache-artifacts
34-
with:
35-
path: ~/.julia/artifacts
36-
key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
37-
restore-keys: |
38-
${{ runner.os }}-test-${{ env.cache-name }}-
39-
${{ runner.os }}-test-
40-
${{ runner.os }}-
31+
- uses: julia-actions/cache@v2
4132
- uses: julia-actions/julia-buildpkg@v1
42-
- uses: julia-actions/julia-runtest@v1
43-
- uses: julia-actions/julia-processcoverage@v1
44-
- uses: codecov/codecov-action@v1
33+
- name: Run tests with coverage
34+
uses: julia-actions/julia-runtest@v1
4535
with:
46-
file: lcov.info
36+
coverage: true
37+
# Process coverage with Coverage.jl and upload it to both Codecov and Coveralls
38+
- name: Process and upload coverage with Coverage.jl
39+
if: matrix.version == '1' && matrix.os == 'ubuntu-latest'
40+
run: |
41+
julia -e '
42+
using Pkg; Pkg.add("Coverage")
43+
using Coverage
44+
process_and_upload(service=:both, folder="src")
45+
'
46+
env:
47+
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
48+
COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }}

.github/workflows/TagBot.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,4 @@ jobs:
1212
- uses: JuliaRegistries/TagBot@v1
1313
with:
1414
token: ${{ secrets.GITHUB_TOKEN }}
15+
ssh: ${{ secrets.DOCUMENTER_KEY }}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
name: Documentation
2+
on:
3+
push:
4+
branches:
5+
- master
6+
- v0.2-dev
7+
tags: '*'
8+
9+
jobs:
10+
build:
11+
permissions:
12+
actions: write
13+
contents: write
14+
pull-requests: read
15+
statuses: write
16+
runs-on: ubuntu-latest
17+
steps:
18+
- uses: actions/checkout@v4
19+
- uses: julia-actions/setup-julia@v2
20+
with:
21+
version: '1'
22+
- uses: julia-actions/cache@v2
23+
- name: Install dependencies
24+
shell: julia --color=yes --project=docs {0}
25+
run: |
26+
using Pkg
27+
Pkg.develop(PackageSpec(path=pwd()))
28+
Pkg.instantiate()
29+
- name: Build and deploy
30+
run: julia --color=yes --project=docs docs/make.jl
31+
env:
32+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
33+
DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,6 @@ deps/deps.jl
66
*.ipynb_checkpoints
77

88
Manifest.toml
9+
Manifest-*.toml
10+
.vscode/settings.json
11+
.DS_Store

PlotMNIST.ipynb

Lines changed: 0 additions & 99 deletions
This file was deleted.

Project.toml

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
11
name = "UMAP"
22
uuid = "c4f8c510-2410-5be4-91d7-4fbaeb39457e"
3+
version = "0.2.0"
34
authors = ["Dillon Daudert <[email protected]>"]
4-
version = "0.1.11"
5+
6+
[workspace]
7+
projects = ["UMAP", "test", "docs"]
58

69
[deps]
10+
Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
711
Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97"
812
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
913
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -13,14 +17,9 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
1317
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
1418

1519
[compat]
20+
Accessors = "0.1.43"
1621
Arpack = "0.4, 0.5"
1722
Distances = "0.8, 0.9, 0.10"
1823
LsqFit = "0.6, 0.7, 0.8, 0.9, 0.10, 0.11, 0.12, 0.13, 0.14, 0.15"
1924
NearestNeighborDescent = "0.3"
20-
julia = "1.6"
21-
22-
[extras]
23-
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
24-
25-
[targets]
26-
test = ["Test"]
25+
julia = "1.10, 1.11, 1.12"

README.md

Lines changed: 34 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# UMAP.jl
2-
[![Coverage Status](https://coveralls.io/repos/github/dillondaudert/UMAP.jl/badge.svg?branch=master)](https://coveralls.io/github/dillondaudert/UMAP.jl?branch=master) [![codecov](https://codecov.io/gh/dillondaudert/UMAP.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/dillondaudert/UMAP.jl)
2+
| **Documentation** | **Build Status** | **Test Coverage** |
3+
|:-----------------:|:----------------:|:----------------:|
4+
| [![][docs-stable-img]][docs-stable-url] [![][docs-dev-img]][docs-dev-url] | [![CI](https://github.com/dillondaudert/UMAP.jl/actions/workflows/CI.yml/badge.svg?branch=v0.2-dev)](https://github.com/dillondaudert/UMAP.jl/actions/workflows/CI.yml) | [![][codecov-img]][codecov-url] [![][coveralls-img]][coveralls-url] |
35

46
A pure Julia implementation of the [Uniform Manifold Approximation and Projection](https://arxiv.org/abs/1802.03426) dimension reduction
57
algorithm
@@ -9,74 +11,58 @@ algorithm
911
1012
## Usage
1113
```jl
12-
embedding = umap(X, n_components; n_neighbors, metric, min_dist, ...)
14+
result = UMAP.fit(data, n_components; n_neighbors, metric, ...) -> UMAP.UMAPResult
15+
result.embedding
1316
```
14-
The `umap` function takes two arguments, `X` (a column-major matrix of shape (n_features, n_samples)), `n_components` (the number of dimensions in the output embedding), and various keyword arguments. Several important ones are:
15-
- `n_neighbors::Int=15`: This controls how many neighbors around each point are considered to be part of its local neighborhood. Larger values will result in embeddings that capture more global structure, while smaller values will preserve more local structures.
16-
- `metric::SemiMetric=Euclidean()`: The (semi)metric to use when calculating distances between points. This can be any subtype of the `SemiMetric` type from the `Distances.jl` package, including user-defined types.
17-
- `min_dist::Float=0.1`: This controls the minimum spacing of points in the embedding. Larger values will cause points to be more evenly distributed, while smaller values will preserve more local structure.
17+
The `fit` function takes two positional arguments, `data` (either a column-major matrix or a vector of "points", e.g. vectors) and `n_components` (the number of dimensions in the output embedding), plus various keyword arguments. Several important ones are:
18+
- `n_neighbors`: This controls how many neighbors around each point are considered to be part of its local neighborhood. Larger values will result in embeddings that capture more global structure, while smaller values will preserve more local structures.
19+
- `metric`: The distance (semi-)metric to use when calculating distances between points. This can be any subtype of the `SemiMetric` type from the `Distances.jl` package, including user-defined types.
20+
- `min_dist`: This controls the minimum spacing of points in the embedding. Larger values will cause points to be more evenly distributed, while smaller values will preserve more local structure.
1821

19-
The returned `embedding` will be a matrix of shape (n_components, n_samples).
22+
`UMAP.fit` returns a `UMAPResult` struct, with the output embedding at
23+
`result.embedding`.
2024

2125
### Using precomputed distances
22-
UMAP can use a precomputed distance matrix instead of finding the nearest neighbors itself. In this case, the distance matrix is passed as `X` and the `metric` keyword argument should be `:precomputed`. Example:
26+
UMAP can use a precomputed distance matrix instead of finding the nearest neighbors itself. In this case, the distance matrix is passed as `data` and the `metric` keyword argument should be `:precomputed`. Example:
2327

2428
```jl
25-
embedding = umap(distances, n_components; metric=:precomputed)
29+
result = UMAP.fit(distances, n_components; metric=:precomputed)
2630
```
2731

28-
## Fitting a UMAP model to a dataset and transforming new data
32+
### Transforming new data
2933

30-
### Constructing a model
31-
To construct a model to use for embedding new data, use the constructor:
34+
After embedding a dataset, we can transform new points into the same
35+
embedding space via `UMAP.transform`:
3236
```jl
33-
model = UMAP_(X, n_components; <kwargs>)
34-
```
35-
where the constructor takes the same keyword arguments (kwargs) as `umap`. The returned object has the following fields:
36-
```jl
37-
model.graph # The graph of fuzzy simplicial set membership strengths of each point in the dataset
38-
model.embedding # The embedding of the dataset
39-
model.data # A reference to the original dataset
40-
model.knns # A matrix of indices of nearest neighbors of points in the dataset,
41-
# as determined on the original manifold (may be approximate)
42-
model.dists # The distances of the neighbors indicated by model.knns
43-
```
37+
result = UMAP.fit(data, n_components; <kwargs>)
4438

45-
### Embedding new data
46-
To transform new data into the existing embedding of a UMAP model, use the `transform` function:
47-
```jl
48-
Q_embedding = transform(model, Q; <kwargs>)
39+
transform_result = UMAP.transform(result, new_data) -> UMAP.UMAPTransformResult
40+
transform_result.embedding
4941
```
50-
where `Q` is a matrix of new query data to embed into the existing embedding, and `model` is the object obtained from the `UMAP_` call above. `Q` must come from a space of the same dimensionality as `model.data` (ie `X` in the `UMAP_` call above).
5142

52-
The remaining keyword arguments (kwargs) are the same as for above functions.
43+
Note that the type of `new_data` must match the original `data`
44+
exactly. The parameterization used for `fit` is re-used where
45+
appropriate in `transform`, via the `UMAPResult` struct.
5346

54-
## Implementation Details
55-
There are two main steps involved in UMAP: building a weighted graph with edges connecting points to their nearest neighbors, and optimizing the low-dimensional embedding of that graph. The first step is accomplished either by an exact kNN search (for datasets with `< 4096` points) or by the approximate kNN search algorithm, [NNDescent](https://github.com/dillondaudert/NearestNeighborDescent.jl). This step is also usually the most costly.
56-
57-
The low-dimensional embedding is initialized (by default) with the eigenvectors of the normalized Laplacian of the kNN graph. These are found using ARPACK (via [Arpack.jl](https://github.com/JuliaLinearAlgebra/Arpack.jl)).
47+
## Examples
48+
The docs have more examples, e.g.
49+
- [MNIST](https://dillondaudert.github.io/UMAP.jl/dev/examples/mnist/)
50+
- [Advanced Usage](https://dillondaudert.github.io/UMAP.jl/dev/examples/advanced_usage/)
5851

59-
## Current Limitations
60-
- **Input data types**: Only data points that are represented by vectors of numbers (passed in as a matrix) are valid inputs. This is mostly due to a lack of support for other formats in [NNDescent](https://github.com/dillondaudert/NearestNeighborDescent.jl). Support for e.g. string datasets is possible in the future
61-
- **Sequential**: This implementation does not take advantage of any parallelism
6252

6353
## External Resources
6454
- [Understanding UMAP](https://pair-code.github.io/understanding-umap/)
6555
- For a great description of how UMAP works, see [this page](https://umap-learn.readthedocs.io/en/latest/how_umap_works.html) from the Python UMAP documentation
6656
- If you're familiar with [t-SNE](https://lvdmaaten.github.io/tsne/), then [this page](https://jlmelville.github.io/uwot/umap-for-tsne.html) describes UMAP with similar vocabulary to that dimension reduction algorithm
6757

68-
## Examples
69-
The full MNIST and FMNIST datasets are plotted below using both this implementation and the [Python implementation](github.com/lmcinnes/umap) for comparison. These were generated by [this notebook](PlotMNIST.ipynb).
58+
[docs-stable-img]: https://img.shields.io/badge/docs-stable-blue.svg
59+
[docs-stable-url]: https://dillondaudert.github.io/UMAP.jl/stable
7060

71-
Note that the memory allocation for the Python UMAP is unreliable, as Julia's benchmarking doesn't count memory allocated within Python itself.
72-
### MNIST
73-
![Julia MNIST](img/mnist_julia.png)
74-
![Python MNIST](img/mnist_python.png)
61+
[docs-dev-img]: https://img.shields.io/badge/docs-dev-blue.svg
62+
[docs-dev-url]: https://dillondaudert.github.io/UMAP.jl/dev
7563

76-
### FMNIST
77-
![Julia FMNIST](img/fmnist_julia.png)
78-
![Python FMNIST](img/fmnist_python.png)
64+
[codecov-img]: https://codecov.io/gh/dillondaudert/UMAP.jl/branch/v0.2-dev/graph/badge.svg
65+
[codecov-url]: https://codecov.io/gh/dillondaudert/UMAP.jl
7966

80-
## Disclaimer
81-
This implementation is a work-in-progress. If you encounter any issues, please create
82-
an issue or make a pull request.
67+
[coveralls-img]: https://coveralls.io/repos/github/dillondaudert/UMAP.jl/badge.svg?branch=v0.2-dev
68+
[coveralls-url]: https://coveralls.io/github/dillondaudert/UMAP.jl?branch=v0.2-dev

docs/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
build/
2+
.DS_Store

docs/Project.toml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
[deps]
2+
CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
3+
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
4+
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
5+
MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
6+
NearestNeighborDescent = "dd2c4c9e-a32f-5b2f-b342-08c2f244fce8"
7+
Pluto = "c3e4b0f8-55cb-11ea-2926-15256bba5781"
8+
PlutoStaticHTML = "359b1769-a58e-495b-9770-312e911026ad"
9+
StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404"
10+
UMAP = "c4f8c510-2410-5be4-91d7-4fbaeb39457e"

0 commit comments

Comments
 (0)