2
2
3
3
import os
4
4
from io import StringIO
5
- import numpy as np
5
+ from collections . abc import Callable
6
6
from numpy .typing import ArrayLike # For type hints
7
- from typing import Literal
7
+ from typing import Literal , Any
8
8
9
9
from math import gamma as gamma_function
10
10
# NOTE: if we ever start using SciPy, we can use
13
13
import lzma
14
14
import shutil
15
15
import tempfile
16
-
17
16
from importlib .resources import files
18
17
18
+ import numpy as np
19
+
19
20
from ._utils import (
20
21
asarray_maybe_copy ,
21
22
unique_nosort ,
22
- groupby ,
23
23
np2d_to_double_array ,
24
24
np1d_to_double_array ,
25
25
np1d_to_int_array ,
@@ -711,8 +711,10 @@ def is_nondominated_within_sets(
711
711
) -> np .ndarray :
712
712
r"""Identify dominated points according to Pareto optimality within each set.
713
713
714
- Executes the :func:`is_nondominated` function within each set in a dataset \
715
- and returns back a 1D array of booleans.
714
+ Executes the :func:`is_nondominated` function within each set in a dataset
715
+ and returns a 1D array of booleans. This is equivalent to
716
+ ``apply_within_sets(data, sets, is_nondominated, ...)`` but slightly
717
+ faster.
716
718
717
719
Parameters
718
720
----------
@@ -735,23 +737,29 @@ def is_nondominated_within_sets(
735
737
736
738
See Also
737
739
--------
738
- filter_dominated_within_sets : to filter out dominated points.
740
+ filter_dominated_within_sets : filter out dominated points.
741
+ apply_within_sets : a more general way to apply any function to each set.
739
742
740
743
Examples
741
744
--------
745
+ >>> x = np.array([[1, 2, 1], [1, 3, 1], [2, 1, 1], [2, 2, 2]])
746
+ >>> moocore.is_nondominated_within_sets(x[:, :-1], x[:, -1])
747
+ array([ True, False, True, True])
742
748
>>> x = moocore.get_dataset("input1.dat")
743
- >>> nondom_per_set = moocore.is_nondominated_within_sets(x[:, :-1], x[:, -1])
749
+ >>> nondom_per_set = moocore.is_nondominated_within_sets(
750
+ ... x[:, :-1], x[:, -1]
751
+ ... )
744
752
>>> len(nondom_per_set)
745
753
100
746
- >>> nondom_per_set # doctest: +ELLIPSIS
754
+ >>> nondom_per_set # doctest: +ELLIPSIS
747
755
array([False, False, True, False, True, False, False, False, False,
748
756
True, False, True, True, True, False, True, True, True,
749
757
False, True, False, False, False, False, True, False, True,
750
758
...
751
759
True, True, True, False, True, False, True, True, False,
752
760
True, False, False, True, True, False, False, False, False,
753
761
False])
754
- >>> x[nondom_per_set, :] # doctest: +ELLIPSIS
762
+ >>> x[nondom_per_set, :] # doctest: +ELLIPSIS
755
763
array([[ 0.20816431, 4.62275469, 1. ],
756
764
[ 0.22997367, 1.11772205, 1. ],
757
765
[ 0.58799475, 0.73891181, 1. ],
@@ -771,18 +779,22 @@ def is_nondominated_within_sets(
771
779
if ncols < 2 :
772
780
raise ValueError ("'data' must have at least 2 columns (2 objectives)" )
773
781
774
- is_nondom = np .concatenate (
775
- apply_within_sets (
776
- data ,
777
- sets ,
778
- is_nondominated ,
779
- maximise = maximise ,
780
- keep_weakly = keep_weakly ,
781
- ),
782
- dtype = bool ,
783
- casting = "no" ,
782
+ # FIXME: How can we make this faster?
783
+ _ , idx , inv = np .unique (sets , return_index = True , return_inverse = True )
784
+ # Remember the original position of each element of each set.
785
+ idx = [np .flatnonzero (inv == i ) for i in idx .argsort ()]
786
+ data = np .concatenate (
787
+ [
788
+ is_nondominated (
789
+ data .take (g_idx , axis = 0 ),
790
+ maximise = maximise ,
791
+ keep_weakly = keep_weakly ,
792
+ )
793
+ for g_idx in idx
794
+ ]
784
795
)
785
- return is_nondom
796
+ idx = np .concatenate (idx ).argsort ()
797
+ return data .take (idx , axis = 0 )
786
798
787
799
788
800
def filter_dominated (
@@ -822,12 +834,12 @@ def filter_dominated_within_sets(
822
834
Either a single boolean value that applies to all objectives or a list of booleans, with one value per objective. \
823
835
Also accepts a 1D numpy array with values 0 or 1 for each objective
824
836
keep_weakly :
825
- If ``False``, return ``False`` for any duplicates of nondominated points.
837
+ If ``False``, duplicates of nondominated points are also removed; if ``True``, they are kept.
826
838
827
839
Returns
828
840
-------
829
841
A numpy array where each set only contains nondominated points with respect to the set (last column is the set index).
830
- Points from one set can still dominated points from another set.
842
+ Points from one set can still dominate points from another set.
831
843
832
844
Examples
833
845
--------
@@ -1893,10 +1905,10 @@ def get_dataset(filename: str, /) -> np.ndarray:
1893
1905
return read_datasets (get_dataset_path (filename ))
1894
1906
1895
1907
1896
- def apply_within_sets (x : ArrayLike , sets : ArrayLike , func , ** kwargs ):
1897
- """Split ``x`` by row according to `` sets`` and apply ``fun`` to each row.
1898
-
1899
- See https://github.com/numpy/numpy/issues/7265
1908
+ def apply_within_sets (
1909
+ x : ArrayLike , sets : ArrayLike , func : Callable [..., Any ], ** kwargs
1910
+ ) -> np . ndarray :
1911
+ """Split ``x`` by row according to ``sets`` and apply ``func`` to each sub-array.
1900
1912
1901
1913
Parameters
1902
1914
----------
@@ -1905,20 +1917,91 @@ def apply_within_sets(x: ArrayLike, sets: ArrayLike, func, **kwargs):
1905
1917
sets :
1906
1918
A list or 1D array of length equal to the number of rows of ``x``. The values are used as-is to determine the groups and do not need to be sorted.
1907
1919
func :
1908
- A function that can take a 2D array as input.
1920
+ A function that can take a 2D array as input. This function may return (1) a 2D array with the same number of rows as the input,
1921
+ (2) a 1D array as long as the number of input rows,
1922
+ (3) a scalar value, or
1923
+ (4) a 2D array with a single row.
1924
+
1909
1925
kwargs :
1910
1926
Additional keyword arguments to ``func``.
1911
1927
1912
1928
Returns
1913
1929
-------
1914
- An array.
1930
+ An array whose shape depends on the output of ``func``. See Examples below.
1931
+
1932
+ See Also
1933
+ --------
1934
+ is_nondominated_within_sets, filter_dominated_within_sets
1935
+
1936
+ Examples
1937
+ --------
1938
+ >>> sets = np.array([3, 1, 2, 4, 2, 3, 1])
1939
+ >>> x = np.arange(len(sets) * 2).reshape(-1, 2)
1940
+ >>> x = np.hstack((x, sets.reshape(-1, 1)))
1941
+
1942
+ If ``func`` returns an array with the same number of rows as the input (case 1),
1943
+ then the output is ordered in exactly the same way as the input.
1944
+
1945
+ >>> moocore.apply_within_sets(x, sets, lambda x: x)
1946
+ array([[ 0, 1, 3],
1947
+ [ 2, 3, 1],
1948
+ [ 4, 5, 2],
1949
+ [ 6, 7, 4],
1950
+ [ 8, 9, 2],
1951
+ [10, 11, 3],
1952
+ [12, 13, 1]])
1953
+
1954
+ This is also the behavior if ``func`` returns a 1D array with one value per input row (case 2).
1955
+
1956
+ >>> moocore.apply_within_sets(x, sets, lambda x: x.sum(axis=1))
1957
+ array([ 4, 6, 11, 17, 19, 24, 26])
1958
+
1959
+ If ``func`` returns a single scalar (case 3) or a 2D array with a single row (case 4),
1960
+ then the order of the output is the order of the unique values as found in
1961
+ ``sets``, without sorting the unique values, which is what
1962
+ :meth:`pandas.Series.unique` returns and NOT what :func:`numpy.unique`
1963
+ returns.
1964
+
1965
+ >>> moocore.apply_within_sets(x, sets, lambda x: x.max())
1966
+ array([11, 13, 9, 7])
1967
+
1968
+ >>> moocore.apply_within_sets(x, sets, lambda x: [x.max(axis=0)])
1969
+ array([[10, 11, 3],
1970
+ [12, 13, 1],
1971
+ [ 8, 9, 2],
1972
+ [ 6, 7, 4]])
1973
+
1974
+ In the previous example, ``func`` returns a 2D array with a single row. The
1975
+ following will produce an error because it returns a 1D array, which is
1976
+ interpreted as case 2, but the number of values does not match the number
1977
+ of input rows.
1978
+
1979
+ >>> moocore.apply_within_sets(
1980
+ ... x, sets, lambda x: x.max(axis=0)
1981
+ ... ) # doctest: +ELLIPSIS
1982
+ Traceback (most recent call last):
1983
+ ...
1984
+ ValueError: `func` returned an array of length 3 but the input has length 2 for rows [0 5]
1915
1985
1916
1986
"""
1917
1987
x = np .asarray (x )
1918
- sets = np .asarray (sets )
1919
- if x .shape [0 ] != sets .shape [0 ]:
1920
- raise ValueError (
1921
- f"'x' and 'sets' must have the same length ({ x .shape [0 ]} != { sets .shape [0 ]} )"
1922
- )
1923
-
1924
- return [func (g , ** kwargs ) for g in groupby (x , sets )]
1988
+ _ , idx , inv = np .unique (sets , return_index = True , return_inverse = True )
1989
+ # Remember the original position of each element of each set.
1990
+ idx = [np .flatnonzero (inv == i ) for i in idx .argsort ()]
1991
+ res = []
1992
+ shorter = False
1993
+ for g_idx in idx :
1994
+ z = func (x .take (g_idx , axis = 0 ), ** kwargs )
1995
+ z = np .atleast_1d (z )
1996
+ if len (z ) != len (g_idx ):
1997
+ if len (z ) != 1 :
1998
+ raise ValueError (
1999
+ f"`func` returned an array of length { len (z )} but the input has length { len (g_idx )} for rows { g_idx } "
2000
+ )
2001
+ shorter = True
2002
+ res .append (z )
2003
+
2004
+ res = np .concatenate (res )
2005
+ if not shorter :
2006
+ res = res .take (np .concatenate (idx ).argsort (), axis = 0 )
2007
+ return res
0 commit comments