Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/expected categories #1597

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
4 changes: 4 additions & 0 deletions docs/releases/unreleased.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@
## stream

- `stream.iter_arff` now supports blank values (treated as missing values).

## preprocessing

- Add support for expected categories in `preprocessing.OneHotEncoder`, `preprocessing.OrdinalEncoder`, akin to scikit-learn API for respective encoders.
123 changes: 102 additions & 21 deletions river/preprocessing/one_hot.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,27 @@ class OneHotEncoder(base.MiniBatchTransformer):

Parameters
----------
categories
Categories (unique values) per feature:
`None` : Determine categories automatically from the training data.

dict of sets : Expected categories for each feature. The outer dict maps each feature name to
the set of category values expected for that feature.

The used categories can be found in the `values` attribute.
drop_zeros
Whether or not 0s should be made explicit or not.
drop_first
Whether to get `k - 1` dummies out of `k` categorical levels by removing the first key.
This is useful in some statistical models where perfectly collinear features cause
problems.

Attributes
----------
values
A dict of dicts. The outer dict maps each feature to its inner dict. The inner dict maps
each category to its code.

Examples
--------

Expand All @@ -46,11 +60,11 @@ class OneHotEncoder(base.MiniBatchTransformer):
... ]
>>> pprint(X)
[{'c1': 'u', 'c2': 'd'},
{'c1': 'a', 'c2': 'x'},
{'c1': 'i', 'c2': 'h'},
{'c1': 'h', 'c2': 'e'}]
{'c1': 'a', 'c2': 'x'},
{'c1': 'i', 'c2': 'h'},
{'c1': 'h', 'c2': 'e'}]

e can now apply one-hot encoding. All the provided are one-hot encoded, there is therefore
We can now apply one-hot encoding. All the provided features are one-hot encoded, there is therefore
no need to specify which features to encode.

>>> from river import preprocessing
Expand Down Expand Up @@ -85,6 +99,28 @@ class OneHotEncoder(base.MiniBatchTransformer):
{'c2_h': 1}
{'c2_e': 1}

Like in `scikit-learn`, you can also specify the expected categories manually.
This is handy when you want to constrain category encoding space
to e.g. top 20% most popular category values you've picked in advance.

>>> categories = {'c1': {'a', 'h'}, 'c2': {'x', 'e'}}
>>> oh = preprocessing.OneHotEncoder(categories=categories)
>>> for x in X:
... oh.learn_one(x)
... pprint(oh.transform_one(x))
{'c1_a': 0, 'c1_h': 0, 'c2_e': 0, 'c2_x': 0}
{'c1_a': 1, 'c1_h': 0, 'c2_e': 0, 'c2_x': 1}
{'c1_a': 0, 'c1_h': 0, 'c2_e': 0, 'c2_x': 0}
{'c1_a': 0, 'c1_h': 1, 'c2_e': 1, 'c2_x': 0}

>>> for key in sorted(oh.values.keys()):
... print(key)
... print(sorted(oh.values[key]))
c1
['a', 'h']
c2
['e', 'x']

A subset of the features can be one-hot encoded by piping a `compose.Select` into the
`OneHotEncoder`.

Expand Down Expand Up @@ -135,6 +171,7 @@ class OneHotEncoder(base.MiniBatchTransformer):
>>> from pprint import pprint
>>> import random
>>> import string
>>> import pandas as pd

>>> random.seed(42)
>>> alphabet = list(string.ascii_lowercase)
Expand Down Expand Up @@ -192,23 +229,45 @@ class OneHotEncoder(base.MiniBatchTransformer):
c2_x Sparse[uint8, 0]
dtype: object

Explicit categories:

>>> oh = preprocessing.OneHotEncoder(categories=categories)


>>> oh.learn_many(X)
>>> df = oh.transform_many(X)
>>> df.sort_index(axis="columns")
c1_a c1_h c2_e c2_x
0 0 0 0 0
1 1 0 0 1
2 0 0 0 0

"""

def __init__(self, drop_zeros=False, drop_first=False):
def __init__(self, categories: dict | None = None, drop_zeros=False, drop_first=False):
    self.drop_zeros = drop_zeros
    self.drop_first = drop_first
    self.categories = categories
    # `values` maps each feature to the set of its known categories. When no
    # expected categories are provided it grows as data is observed; otherwise
    # the user-provided mapping is used as-is and kept fixed (sklearn-like).
    # NOTE: `values` is never None after construction, so the annotation does
    # not include None and no dead `= None` pre-assignment is needed.
    self.values: collections.defaultdict | dict = (
        collections.defaultdict(set) if categories is None else categories
    )

def learn_one(self, x):
    """Record the categories present in `x`.

    A feature's value may itself be a list or set of categories, in which
    case every member is recorded individually.
    """
    # Nothing to record when zeros are implicit, and explicitly provided
    # category mappings are intentionally kept fixed (sklearn-like behavior).
    if self.drop_zeros or self.categories is not None:
        return

    for feature, value in x.items():
        if isinstance(value, (list, set)):
            self.values[feature].update(value)
        else:
            self.values[feature].add(value)

def transform_one(self, x, y=None):
oh = {}
Expand All @@ -217,13 +276,25 @@ def transform_one(self, x, y=None):
if not self.drop_zeros:
oh = {f"{i}_{v}": 0 for i, values in self.values.items() for v in values}

# Add 1s
for i, xi in x.items():
if isinstance(xi, list) or isinstance(xi, set):
for xj in xi:
oh[f"{i}_{xj}"] = 1
else:
oh[f"{i}_{xi}"] = 1
# Add 1
# NOTE: assume if category mappings are explicitly provided,
# no other category values are allowed for output. Aligns with `sklearn` behavior.
if self.categories is None:
for i, xi in x.items():
if isinstance(xi, list) or isinstance(xi, set):
for xj in xi:
oh[f"{i}_{xj}"] = 1
else:
oh[f"{i}_{xi}"] = 1
else:
for i, xi in x.items():
if isinstance(xi, list) or isinstance(xi, set):
for xj in xi:
if xj in self.values[i]:
oh[f"{i}_{xj}"] = 1
else:
if xi in self.values[i]:
oh[f"{i}_{xi}"] = 1

if self.drop_first:
oh.pop(min(oh.keys()))
Expand All @@ -234,12 +305,22 @@ def learn_many(self, X):
if self.drop_zeros:
return

for col in X.columns:
self.values[col].update(X[col].unique())
# NOTE: assume if category mappings are explicitly provided,
# they're intended to be kept fixed.
if self.categories is None:
for col in X.columns:
self.values[col].update(X[col].unique())

def transform_many(self, X):
oh = pd.get_dummies(X, columns=X.columns, sparse=True, dtype="uint8")

# NOTE: assume if category mappings are explicitly provided,
# no other category values are allowed for output. Aligns with `sklearn` behavior.
if self.categories is not None:
seen_in_the_past = {f"{col}_{val}" for col, vals in self.values.items() for val in vals}
to_remove = set(oh.columns) - seen_in_the_past
oh.drop(columns=list(to_remove), inplace=True)

if not self.drop_zeros:
seen_in_the_past = {f"{col}_{val}" for col, vals in self.values.items() for val in vals}
to_add = seen_in_the_past - set(oh.columns)
Expand Down
78 changes: 56 additions & 22 deletions river/preprocessing/ordinal.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,14 @@ class OrdinalEncoder(base.MiniBatchTransformer):

Parameters
----------
categories
Categories (unique values) per feature:
`None` : Determine categories automatically from the training data.

dict of dicts : Expected categories for each feature. The outer dict maps each feature to its inner dict.
The inner dict maps each category to its code.

The used categories can be found in the `values` attribute.
unknown_value
The value to use for unknown categories seen during `transform_one`. Unknown categories
will be mapped to an integer once they are seen during `learn_one`. This value can be set
Expand All @@ -31,7 +39,7 @@ class OrdinalEncoder(base.MiniBatchTransformer):

Attributes
----------
categories
values
A dict of dicts. The outer dict maps each feature to its inner dict. The inner dict maps
each category to its code.

Expand Down Expand Up @@ -64,6 +72,26 @@ class OrdinalEncoder(base.MiniBatchTransformer):
{'country': 2, 'place': 1}
{'country': -1, 'place': -1}

Like in `scikit-learn`, you can also specify the expected categories manually.
This is handy when you want to constrain category encoding space
to e.g. top 20% most popular category values you've picked in advance.

>>> categories = {'country': {'France': 1},
... 'place': {'Burger King': 2, 'Starbucks': 3}}
>>> encoder = preprocessing.OrdinalEncoder(categories=categories)
>>> for x in X:
... print(encoder.transform_one(x))
... encoder.learn_one(x)
{'country': 1, 'place': 0}
{'country': -1, 'place': -1}
{'country': 0, 'place': 2}
{'country': 1, 'place': 2}
{'country': 0, 'place': 3}
{'country': 0, 'place': 3}
{'country': 0, 'place': 0}
{'country': -1, 'place': -1}

>>> import pandas as pd
>>> xb1 = pd.DataFrame(X[0:4], index=[0, 1, 2, 3])
>>> xb2 = pd.DataFrame(X[4:8], index=[4, 5, 6, 7])

Expand All @@ -87,48 +115,54 @@ class OrdinalEncoder(base.MiniBatchTransformer):

def __init__(
    self,
    categories: dict | None = None,
    unknown_value: int | None = 0,
    none_value: int = -1,
):
    self.unknown_value = unknown_value
    self.none_value = none_value
    self.categories = categories

    # `values` maps each feature to its inner {category: code} dict. It is
    # never None after construction, so no dead `= None` pre-assignment.
    self.values: collections.defaultdict | dict
    if categories is None:
        # One auto-incrementing counter per feature generates the category
        # codes, skipping the reserved unknown/none sentinel values.
        self._counters: collections.defaultdict = collections.defaultdict(
            functools.partial(make_counter, {unknown_value, none_value})
        )
        self.values = collections.defaultdict(dict)
    else:
        # Explicitly provided categories are kept fixed, so no counters are
        # created. NOTE(review): learn_one/learn_many must keep guarding on
        # `self.categories is None`, since `_counters` is undefined here.
        self.values = self.categories

def transform_one(self, x):
    """Map each feature value of `x` to its ordinal code.

    `None` values map to `none_value`; categories without a known code —
    including whole features absent from an explicit `categories` mapping —
    map to `unknown_value`.
    """
    # `.get(i, {})` instead of `self.values[i]`: avoids a KeyError when an
    # explicit categories dict lacks the feature, and avoids mutating a
    # defaultdict as a side effect of a pure transform.
    return {
        i: self.none_value if xi is None else self.values.get(i, {}).get(xi, self.unknown_value)
        for i, xi in x.items()
    }

def learn_one(self, x):
    """Assign a fresh code to every previously unseen category in `x`."""
    # User-supplied categories are treated as frozen: nothing to learn.
    if self.categories is not None:
        return

    for feature, value in x.items():
        # Skip missing values first so that an all-None feature never even
        # touches (and thus never creates) an entry in the defaultdict.
        if value is None:
            continue
        codes = self.values[feature]
        if value not in codes:
            codes[value] = next(self._counters[feature])

def transform_many(self, X):
    """Encode each column of `X` into its ordinal codes.

    Missing (`None`/NaN) values map to `none_value`, unknown categories —
    including whole columns absent from an explicit `categories` mapping —
    map to `unknown_value`. Returns an int64 frame with the same columns.
    """
    return pd.DataFrame(
        {
            col: pd.Series(
                # `.get(col, {})` tolerates columns missing from an explicit
                # categories mapping instead of raising a KeyError.
                X[col]
                .map({**self.values.get(col, {}), None: self.none_value})
                .fillna(self.unknown_value),
                dtype=np.int64,
            )
            for col in X.columns
        }
    )

def learn_many(self, X, y=None):
    """Assign fresh codes to previously unseen categories, column by column."""
    # User-supplied categories are treated as frozen: nothing to learn.
    if self.categories is not None:
        return

    for col in X.columns:
        for cat in X[col].dropna().unique():
            # Look the inner dict up per category (not once per column) so a
            # column with only missing values never creates a defaultdict entry.
            codes = self.values[col]
            if cat not in codes:
                codes[cat] = next(self._counters[col])
Loading