Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/expected categories #1597

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
4 changes: 4 additions & 0 deletions docs/releases/unreleased.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@
## stream

- `stream.iter_arff` now supports blank values (treated as missing values).

## preprocessing

- Add support for expected categories in `preprocessing.OneHotEncoder`, `preprocessing.OrdinalEncoder`, akin to scikit-learn API for respective encoders.
123 changes: 102 additions & 21 deletions river/preprocessing/one_hot.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,27 @@ class OneHotEncoder(base.MiniBatchTransformer):

Parameters
----------
categories
Categories (unique values) per feature:
`None` : Determine categories automatically from the training data.

dict of sets : Expected categories for each feature. The outer dict maps each feature name to
the set of category values expected for that feature.

The used categories can be found in the `values` attribute.
drop_zeros
Whether or not 0s should be made explicit or not.
drop_first
Whether to get `k - 1` dummies out of `k` categorical levels by removing the first key.
This is useful in some statistical models where perfectly collinear features cause
problems.

Attributes
----------
values
A dict of dicts. The outer dict maps each feature to its inner dict. The inner dict maps
each category to its code.

Examples
--------

Expand All @@ -46,11 +60,11 @@ class OneHotEncoder(base.MiniBatchTransformer):
... ]
>>> pprint(X)
[{'c1': 'u', 'c2': 'd'},
{'c1': 'a', 'c2': 'x'},
{'c1': 'i', 'c2': 'h'},
{'c1': 'h', 'c2': 'e'}]
{'c1': 'a', 'c2': 'x'},
{'c1': 'i', 'c2': 'h'},
{'c1': 'h', 'c2': 'e'}]

e can now apply one-hot encoding. All the provided are one-hot encoded, there is therefore
We can now apply one-hot encoding. All the provided features are one-hot encoded, there is therefore
no need to specify which features to encode.

>>> from river import preprocessing
Expand Down Expand Up @@ -85,6 +99,28 @@ class OneHotEncoder(base.MiniBatchTransformer):
{'c2_h': 1}
{'c2_e': 1}

Like in `scikit-learn`, you can also specify the expected categories manually.
This is handy when you want to constrain category encoding space
to e.g. top 20% most popular category values you've picked in advance.

>>> categories = {'c1': {'a', 'h'}, 'c2': {'x', 'e'}}
>>> oh = preprocessing.OneHotEncoder(categories=categories)
>>> for x in X:
... oh.learn_one(x)
... pprint(oh.transform_one(x))
{'c1_a': 0, 'c1_h': 0, 'c2_e': 0, 'c2_x': 0}
{'c1_a': 1, 'c1_h': 0, 'c2_e': 0, 'c2_x': 1}
{'c1_a': 0, 'c1_h': 0, 'c2_e': 0, 'c2_x': 0}
{'c1_a': 0, 'c1_h': 1, 'c2_e': 1, 'c2_x': 0}

>>> for key in sorted(oh.values.keys()):
... print(key)
... print(sorted(oh.values[key]))
c1
['a', 'h']
c2
['e', 'x']

A subset of the features can be one-hot encoded by piping a `compose.Select` into the
`OneHotEncoder`.

Expand Down Expand Up @@ -135,6 +171,7 @@ class OneHotEncoder(base.MiniBatchTransformer):
>>> from pprint import pprint
>>> import random
>>> import string
>>> import pandas as pd

>>> random.seed(42)
>>> alphabet = list(string.ascii_lowercase)
Expand Down Expand Up @@ -192,23 +229,45 @@ class OneHotEncoder(base.MiniBatchTransformer):
c2_x Sparse[uint8, 0]
dtype: object

Explicit categories:

>>> oh = preprocessing.OneHotEncoder(categories=categories)


>>> oh.learn_many(X)
>>> df = oh.transform_many(X)
>>> df.sort_index(axis="columns")
c1_a c1_h c2_e c2_x
0 0 0 0 0
1 1 0 0 1
2 0 0 0 0

"""

def __init__(self, drop_zeros=False, drop_first=False):
def __init__(self, categories: dict | None = None, drop_zeros=False, drop_first=False):
    self.drop_zeros = drop_zeros
    self.drop_first = drop_first
    self.categories = categories
    # `values` maps each feature to the set of its known categories. When no
    # expected categories are provided it grows as data is observed; otherwise
    # the user-provided mapping is used as-is and kept fixed (sklearn-like).
    # NOTE: `values` is never None after construction, so the annotation does
    # not include None and no dead `= None` pre-assignment is needed.
    self.values: collections.defaultdict | dict = (
        collections.defaultdict(set) if categories is None else categories
    )

def learn_one(self, x):
    """Record the categories present in `x`.

    A feature's value may itself be a list or set of categories, in which
    case every member is recorded individually.
    """
    # Nothing to record when zeros are implicit, and explicitly provided
    # category mappings are intentionally kept fixed (sklearn-like behavior).
    if self.drop_zeros or self.categories is not None:
        return

    for feature, value in x.items():
        if isinstance(value, (list, set)):
            self.values[feature].update(value)
        else:
            self.values[feature].add(value)

def transform_one(self, x, y=None):
oh = {}
Expand All @@ -217,13 +276,25 @@ def transform_one(self, x, y=None):
if not self.drop_zeros:
oh = {f"{i}_{v}": 0 for i, values in self.values.items() for v in values}

# Add 1s
for i, xi in x.items():
if isinstance(xi, list) or isinstance(xi, set):
for xj in xi:
oh[f"{i}_{xj}"] = 1
else:
oh[f"{i}_{xi}"] = 1
# Add 1
# NOTE: assume if category mappings are explicitly provided,
# no other category values are allowed for output. Aligns with `sklearn` behavior.
if self.categories is None:
for i, xi in x.items():
if isinstance(xi, list) or isinstance(xi, set):
for xj in xi:
oh[f"{i}_{xj}"] = 1
else:
oh[f"{i}_{xi}"] = 1
else:
for i, xi in x.items():
if isinstance(xi, list) or isinstance(xi, set):
for xj in xi:
if xj in self.values[i]:
oh[f"{i}_{xj}"] = 1
else:
if xi in self.values[i]:
oh[f"{i}_{xi}"] = 1

if self.drop_first:
oh.pop(min(oh.keys()))
Expand All @@ -234,12 +305,22 @@ def learn_many(self, X):
if self.drop_zeros:
return

for col in X.columns:
self.values[col].update(X[col].unique())
# NOTE: assume if category mappings are explicitly provided,
# they're intended to be kept fixed.
if self.categories is None:
for col in X.columns:
self.values[col].update(X[col].unique())

def transform_many(self, X):
oh = pd.get_dummies(X, columns=X.columns, sparse=True, dtype="uint8")

# NOTE: assume if category mappings are explicitly provided,
# no other category values are allowed for output. Aligns with `sklearn` behavior.
if self.categories is not None:
seen_in_the_past = {f"{col}_{val}" for col, vals in self.values.items() for val in vals}
to_remove = set(oh.columns) - seen_in_the_past
oh.drop(columns=list(to_remove), inplace=True)

if not self.drop_zeros:
seen_in_the_past = {f"{col}_{val}" for col, vals in self.values.items() for val in vals}
to_add = seen_in_the_past - set(oh.columns)
Expand Down
78 changes: 56 additions & 22 deletions river/preprocessing/ordinal.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,14 @@ class OrdinalEncoder(base.MiniBatchTransformer):

Parameters
----------
categories
Categories (unique values) per feature:
`None` : Determine categories automatically from the training data.

dict of dicts : Expected categories for each feature. The outer dict maps each feature to its inner dict.
The inner dict maps each category to its code.

The used categories can be found in the `values` attribute.
unknown_value
The value to use for unknown categories seen during `transform_one`. Unknown categories
will be mapped to an integer once they are seen during `learn_one`. This value can be set
Expand All @@ -31,7 +39,7 @@ class OrdinalEncoder(base.MiniBatchTransformer):

Attributes
----------
categories
values
A dict of dicts. The outer dict maps each feature to its inner dict. The inner dict maps
each category to its code.

Expand Down Expand Up @@ -64,6 +72,26 @@ class OrdinalEncoder(base.MiniBatchTransformer):
{'country': 2, 'place': 1}
{'country': -1, 'place': -1}

Like in `scikit-learn`, you can also specify the expected categories manually.
This is handy when you want to constrain category encoding space
to e.g. top 20% most popular category values you've picked in advance.

>>> categories = {'country': {'France': 1},
... 'place': {'Burger King': 2, 'Starbucks': 3}}
>>> encoder = preprocessing.OrdinalEncoder(categories=categories)
>>> for x in X:
... print(encoder.transform_one(x))
... encoder.learn_one(x)
{'country': 1, 'place': 0}
{'country': -1, 'place': -1}
{'country': 0, 'place': 2}
{'country': 1, 'place': 2}
{'country': 0, 'place': 3}
{'country': 0, 'place': 3}
{'country': 0, 'place': 0}
{'country': -1, 'place': -1}

>>> import pandas as pd
>>> xb1 = pd.DataFrame(X[0:4], index=[0, 1, 2, 3])
>>> xb2 = pd.DataFrame(X[4:8], index=[4, 5, 6, 7])

Expand All @@ -87,48 +115,54 @@ class OrdinalEncoder(base.MiniBatchTransformer):

def __init__(
    self,
    categories: dict | None = None,
    unknown_value: int | None = 0,
    none_value: int = -1,
):
    self.unknown_value = unknown_value
    self.none_value = none_value
    self.categories = categories

    # `values` maps each feature to its inner {category: code} dict. It is
    # never None after construction, so no dead `= None` pre-assignment.
    self.values: collections.defaultdict | dict
    if categories is None:
        # One auto-incrementing counter per feature generates the category
        # codes, skipping the reserved unknown/none sentinel values.
        self._counters: collections.defaultdict = collections.defaultdict(
            functools.partial(make_counter, {unknown_value, none_value})
        )
        self.values = collections.defaultdict(dict)
    else:
        # Explicitly provided categories are kept fixed, so no counters are
        # created. NOTE(review): learn_one/learn_many must keep guarding on
        # `self.categories is None`, since `_counters` is undefined here.
        self.values = self.categories

def transform_one(self, x):
    """Map each feature value of `x` to its ordinal code.

    `None` values map to `none_value`; categories without a known code —
    including whole features absent from an explicit `categories` mapping —
    map to `unknown_value`.
    """
    # `.get(i, {})` instead of `self.values[i]`: avoids a KeyError when an
    # explicit categories dict lacks the feature, and avoids mutating a
    # defaultdict as a side effect of a pure transform.
    return {
        i: self.none_value if xi is None else self.values.get(i, {}).get(xi, self.unknown_value)
        for i, xi in x.items()
    }

def learn_one(self, x):
    """Assign a fresh code to every previously unseen category in `x`."""
    # User-supplied categories are treated as frozen: nothing to learn.
    if self.categories is not None:
        return

    for feature, value in x.items():
        # Skip missing values first so that an all-None feature never even
        # touches (and thus never creates) an entry in the defaultdict.
        if value is None:
            continue
        codes = self.values[feature]
        if value not in codes:
            codes[value] = next(self._counters[feature])

def transform_many(self, X):
    """Encode each column of `X` into its ordinal codes.

    Missing (`None`/NaN) values map to `none_value`, unknown categories —
    including whole columns absent from an explicit `categories` mapping —
    map to `unknown_value`. Returns an int64 frame with the same columns.
    """
    return pd.DataFrame(
        {
            col: pd.Series(
                # `.get(col, {})` tolerates columns missing from an explicit
                # categories mapping instead of raising a KeyError.
                X[col]
                .map({**self.values.get(col, {}), None: self.none_value})
                .fillna(self.unknown_value),
                dtype=np.int64,
            )
            for col in X.columns
        }
    )

def learn_many(self, X, y=None):
    """Assign fresh codes to previously unseen categories, column by column."""
    # User-supplied categories are treated as frozen: nothing to learn.
    if self.categories is not None:
        return

    for col in X.columns:
        for cat in X[col].dropna().unique():
            # Look the inner dict up per category (not once per column) so a
            # column with only missing values never creates a defaultdict entry.
            codes = self.values[col]
            if cat not in codes:
                codes[cat] = next(self._counters[col])
Loading