Skip to content

Commit d2390e6

Browse files
committed
python/ Improved apply_within_sets().
1 parent 1e9f854 commit d2390e6

File tree

5 files changed

+136
-71
lines changed

5 files changed

+136
-71
lines changed

Diff for: python/doc/source/whatsnew/index.rst

+13-6
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,26 @@
44
What's new
55
**********
66

7+
Version 0.1.4
8+
-------------
9+
10+
- Improved example :ref:`sphx_glr_auto_examples_plot_pandas.py` to work in Pandas version >= 2.
11+
- Changed behavior of :func:`~moocore.apply_within_sets`. The previous behavior could lead to subtle bugs.
12+
13+
714
Version 0.1.3 (28/10/2024)
815
--------------------------
916

10-
- :class:`Hypervolume`: Object-oriented API for hypervolume indicator.
11-
- :func:`apply_within_sets()`: Utility function to apply operations to individual datasets.
12-
- :func:`is_nondominated_within_sets()`: Utility function to identify nondominated points within sets.
13-
- Fix bug in :func:`normalise()` when the input is :class:`pandas.DataFrame` or some other non-contiguous array.
14-
- New example using :class:`pandas.DataFrame`.
17+
- New: :class:`~moocore.Hypervolume`: Object-oriented API for hypervolume indicator.
18+
- New: :func:`~moocore.apply_within_sets()`: Utility function to apply operations to individual datasets.
19+
- New: :func:`~moocore.is_nondominated_within_sets()`: Utility function to identify nondominated points within sets.
20+
- New example using :class:`pandas.DataFrame` in :ref:`sphx_glr_auto_examples_plot_pandas.py`.
21+
- Fix bug in :func:`~moocore.normalise` when the input is :class:`pandas.DataFrame` or some other non-contiguous array.
1522

1623

1724
Version 0.1.2 (18/09/2024)
1825
--------------------------
1926

20-
- :func:`hv_approx()`: New function.
27+
- New: :func:`~moocore.hv_approx()`
2128
- Documentation improvements.
2229
- New gallery examples.

Diff for: python/examples/plot_pandas.py

-2
Original file line numberDiff line numberDiff line change
@@ -97,5 +97,3 @@
9797
# And use the boolean vector above to filter rows:
9898
#
9999
df[is_nondom]
100-
101-
# %%

Diff for: python/src/moocore/_moocore.py

+119-36
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22

33
import os
44
from io import StringIO
5-
import numpy as np
5+
from collections.abc import Callable
66
from numpy.typing import ArrayLike # For type hints
7-
from typing import Literal
7+
from typing import Literal, Any
88

99
from math import gamma as gamma_function
1010
# NOTE: if we ever start using SciPy, we can use
@@ -13,13 +13,13 @@
1313
import lzma
1414
import shutil
1515
import tempfile
16-
1716
from importlib.resources import files
1817

18+
import numpy as np
19+
1920
from ._utils import (
2021
asarray_maybe_copy,
2122
unique_nosort,
22-
groupby,
2323
np2d_to_double_array,
2424
np1d_to_double_array,
2525
np1d_to_int_array,
@@ -711,8 +711,10 @@ def is_nondominated_within_sets(
711711
) -> np.ndarray:
712712
r"""Identify dominated points according to Pareto optimality within each set.
713713
714-
Executes the :func:`is_nondominated` function within each set in a dataset \
715-
and returns back a 1D array of booleans.
714+
Executes the :func:`is_nondominated` function within each set in a dataset
715+
and returns a 1D array of booleans. This is equivalent to
716+
``apply_within_sets(data, sets, is_nondominated, ...)`` but slightly
717+
faster.
716718
717719
Parameters
718720
----------
@@ -735,23 +737,29 @@ def is_nondominated_within_sets(
735737
736738
See Also
737739
--------
738-
filter_dominated_within_sets : to filter out dominated points.
740+
filter_dominated_within_sets : filter out dominated points.
741+
apply_within_sets : a more general way to apply any function to each set.
739742
740743
Examples
741744
--------
745+
>>> x = np.array([[1, 2, 1], [1, 3, 1], [2, 1, 1], [2, 2, 2]])
746+
>>> moocore.is_nondominated_within_sets(x[:, :-1], x[:, -1])
747+
array([ True, False, True, True])
742748
>>> x = moocore.get_dataset("input1.dat")
743-
>>> nondom_per_set = moocore.is_nondominated_within_sets(x[:, :-1], x[:, -1])
749+
>>> nondom_per_set = moocore.is_nondominated_within_sets(
750+
... x[:, :-1], x[:, -1]
751+
... )
744752
>>> len(nondom_per_set)
745753
100
746-
>>> nondom_per_set # doctest: +ELLIPSIS
754+
>>> nondom_per_set # doctest: +ELLIPSIS
747755
array([False, False, True, False, True, False, False, False, False,
748756
True, False, True, True, True, False, True, True, True,
749757
False, True, False, False, False, False, True, False, True,
750758
...
751759
True, True, True, False, True, False, True, True, False,
752760
True, False, False, True, True, False, False, False, False,
753761
False])
754-
>>> x[nondom_per_set, :] # doctest: +ELLIPSIS
762+
>>> x[nondom_per_set, :] # doctest: +ELLIPSIS
755763
array([[ 0.20816431, 4.62275469, 1. ],
756764
[ 0.22997367, 1.11772205, 1. ],
757765
[ 0.58799475, 0.73891181, 1. ],
@@ -771,18 +779,22 @@ def is_nondominated_within_sets(
771779
if ncols < 2:
772780
raise ValueError("'data' must have at least 2 columns (2 objectives)")
773781

774-
is_nondom = np.concatenate(
775-
apply_within_sets(
776-
data,
777-
sets,
778-
is_nondominated,
779-
maximise=maximise,
780-
keep_weakly=keep_weakly,
781-
),
782-
dtype=bool,
783-
casting="no",
782+
# FIXME: How can we make this faster?
783+
_, idx, inv = np.unique(sets, return_index=True, return_inverse=True)
784+
# Remember the original position of each element of each set.
785+
idx = [np.flatnonzero(inv == i) for i in idx.argsort()]
786+
data = np.concatenate(
787+
[
788+
is_nondominated(
789+
data.take(g_idx, axis=0),
790+
maximise=maximise,
791+
keep_weakly=keep_weakly,
792+
)
793+
for g_idx in idx
794+
]
784795
)
785-
return is_nondom
796+
idx = np.concatenate(idx).argsort()
797+
return data.take(idx, axis=0)
786798

787799

788800
def filter_dominated(
@@ -822,12 +834,12 @@ def filter_dominated_within_sets(
822834
Either a single boolean value that applies to all objectives or a list of booleans, with one value per objective. \
823835
Also accepts a 1D numpy array with values 0 or 1 for each objective
824836
keep_weakly :
825-
If ``False``, return ``False`` for any duplicates of nondominated points.
837+
If ``False``, duplicates of nondominated points are deleted as well; if ``True``, they are kept.
826838
827839
Returns
828840
-------
829841
A numpy array where each set only contains nondominated points with respect to the set (last column is the set index).
830-
Points from one set can still dominated points from another set.
842+
Points from one set can still dominate points from another set.
831843
832844
Examples
833845
--------
@@ -1893,10 +1905,10 @@ def get_dataset(filename: str, /) -> np.ndarray:
18931905
return read_datasets(get_dataset_path(filename))
18941906

18951907

1896-
def apply_within_sets(x: ArrayLike, sets: ArrayLike, func, **kwargs):
1897-
"""Split ``x`` by row according to ``sets`` and apply ``fun`` to each row.
1898-
1899-
See https://github.com/numpy/numpy/issues/7265
1908+
def apply_within_sets(
1909+
x: ArrayLike, sets: ArrayLike, func: Callable[..., Any], **kwargs
1910+
) -> np.ndarray:
1911+
"""Split ``x`` by row according to ``sets`` and apply ``func`` to each sub-array.
19001912
19011913
Parameters
19021914
----------
@@ -1905,20 +1917,91 @@ def apply_within_sets(x: ArrayLike, sets: ArrayLike, func, **kwargs):
19051917
sets :
19061918
A list or 1D array of length equal to the number of rows of ``x``. The values are used as-is to determine the groups and do not need to be sorted.
19071919
func :
1908-
A function that can take a 2D array as input.
1920+
A function that can take a 2D array as input. This function may return (1) a 2D array with the same number of rows as the input,
1921+
(2) a 1D array as long as the number of input rows,
1922+
(3) a scalar value, or
1923+
(4) a 2D array with a single row.
1924+
19091925
kwargs :
19101926
Additional keyword arguments to ``func``.
19111927
19121928
Returns
19131929
-------
1914-
An array.
1930+
An array whose shape depends on the output of ``func``. See Examples below.
1931+
1932+
See Also
1933+
--------
1934+
is_nondominated_within_sets, filter_dominated_within_sets
1935+
1936+
Examples
1937+
--------
1938+
>>> sets = np.array([3, 1, 2, 4, 2, 3, 1])
1939+
>>> x = np.arange(len(sets) * 2).reshape(-1, 2)
1940+
>>> x = np.hstack((x, sets.reshape(-1, 1)))
1941+
1942+
If ``func`` returns an array with the same number of rows as the input (case 1),
1943+
then the output is ordered in exactly the same way as the input.
1944+
1945+
>>> moocore.apply_within_sets(x, sets, lambda x: x)
1946+
array([[ 0, 1, 3],
1947+
[ 2, 3, 1],
1948+
[ 4, 5, 2],
1949+
[ 6, 7, 4],
1950+
[ 8, 9, 2],
1951+
[10, 11, 3],
1952+
[12, 13, 1]])
1953+
1954+
This is also the behavior if ``func`` returns a 1D array with one value per input row (case 2).
1955+
1956+
>>> moocore.apply_within_sets(x, sets, lambda x: x.sum(axis=1))
1957+
array([ 4, 6, 11, 17, 19, 24, 26])
1958+
1959+
If ``func`` returns a single scalar (case 3) or a 2D array with a single row (case 4),
1960+
then the order of the output is the order of the unique values as found in
1961+
``sets``, without sorting the unique values, which is what
1962+
:meth:`pandas.Series.unique` returns and NOT what :func:`numpy.unique`
1963+
returns.
1964+
1965+
>>> moocore.apply_within_sets(x, sets, lambda x: x.max())
1966+
array([11, 13, 9, 7])
1967+
1968+
>>> moocore.apply_within_sets(x, sets, lambda x: [x.max(axis=0)])
1969+
array([[10, 11, 3],
1970+
[12, 13, 1],
1971+
[ 8, 9, 2],
1972+
[ 6, 7, 4]])
1973+
1974+
In the previous example, ``func`` returns a 2D array with a single row. The
1975+
following will produce an error because it returns a 1D array, which is
1976+
interpreted as case 2, but the number of values does not match the number
1977+
of input rows.
1978+
1979+
>>> moocore.apply_within_sets(
1980+
... x, sets, lambda x: x.max(axis=0)
1981+
... ) # doctest: +ELLIPSIS
1982+
Traceback (most recent call last):
1983+
...
1984+
ValueError: `func` returned an array of length 3 but the input has length 2 for rows [0 5]
19151985
19161986
"""
19171987
x = np.asarray(x)
1918-
sets = np.asarray(sets)
1919-
if x.shape[0] != sets.shape[0]:
1920-
raise ValueError(
1921-
f"'x' and 'sets' must have the same length ({x.shape[0]} != {sets.shape[0]})"
1922-
)
1923-
1924-
return [func(g, **kwargs) for g in groupby(x, sets)]
1988+
_, idx, inv = np.unique(sets, return_index=True, return_inverse=True)
1989+
# Remember the original position of each element of each set.
1990+
idx = [np.flatnonzero(inv == i) for i in idx.argsort()]
1991+
res = []
1992+
shorter = False
1993+
for g_idx in idx:
1994+
z = func(x.take(g_idx, axis=0), **kwargs)
1995+
z = np.atleast_1d(z)
1996+
if len(z) != len(g_idx):
1997+
if len(z) != 1:
1998+
raise ValueError(
1999+
f"`func` returned an array of length {len(z)} but the input has length {len(g_idx)} for rows {g_idx}"
2000+
)
2001+
shorter = True
2002+
res.append(z)
2003+
2004+
res = np.concatenate(res)
2005+
if not shorter:
2006+
res = res.take(np.concatenate(idx).argsort(), axis=0)
2007+
return res

Diff for: python/src/moocore/_utils.py

+3-23
Original file line numberDiff line numberDiff line change
@@ -13,33 +13,13 @@ def asarray_maybe_copy(x, dtype=float):
1313

1414

1515
def unique_nosort(array, **kwargs):
16-
uniq, index = np.unique(array, return_index=True, **kwargs)
17-
return uniq[index.argsort()]
18-
19-
20-
def groupby(x, groups, /, *, axis: int = 0):
21-
"""Split an array into groups.
16+
"""Return unique values without sorting them.
2217
2318
See https://github.com/numpy/numpy/issues/7265
2419
25-
Parameters
26-
----------
27-
x : ndarray
28-
Array to be divided into sub-arrays.
29-
groups : 1-D array
30-
A ndarray of length equal to the selected `axis`. The values are used as-is to determine the groups and do not need to be sorted.
31-
axis :
32-
The axis along which to split, default is 0.
33-
34-
Yields
35-
------
36-
sub-array : ndarray
37-
Sub-arrays of `x`.
38-
3920
"""
40-
index = unique_nosort(groups)
41-
for g in index:
42-
yield x.compress(g == groups, axis=axis)
21+
uniq, index = np.unique(array, return_index=True, **kwargs)
22+
return uniq[index.argsort()]
4323

4424

4525
def np2d_to_double_array(x):

Diff for: python/tests/test_pandas.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# ruff: noqa: D100, D101, D102, D103
22
import pytest
33
import moocore
4-
import numpy as np
54
from numpy.testing import assert_array_equal, assert_allclose
65

76
pd = pytest.importorskip("pandas")
@@ -80,7 +79,5 @@ def test_example_pandas():
8079
is_nondom = moocore.is_nondominated_within_sets(df[obj_cols], sets=sets)
8180
assert_array_equal(
8281
is_nondom,
83-
np.array(
84-
[True, True, False, True, False, True, True, True, True, True]
85-
),
82+
[True, True, True, True, True, True, False, True, False, True],
8683
)

0 commit comments

Comments
 (0)