Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/expected categories #1597

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
107 changes: 86 additions & 21 deletions river/preprocessing/one_hot.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,11 @@ class OneHotEncoder(base.MiniBatchTransformer):
... ]
>>> pprint(X)
[{'c1': 'u', 'c2': 'd'},
{'c1': 'a', 'c2': 'x'},
{'c1': 'i', 'c2': 'h'},
{'c1': 'h', 'c2': 'e'}]
{'c1': 'a', 'c2': 'x'},
{'c1': 'i', 'c2': 'h'},
{'c1': 'h', 'c2': 'e'}]

e can now apply one-hot encoding. All the provided are one-hot encoded, there is therefore
We can now apply one-hot encoding. All the features provided are one-hot encoded, there is therefore
no need to specify which features to encode.

>>> from river import preprocessing
Expand Down Expand Up @@ -85,6 +85,29 @@ class OneHotEncoder(base.MiniBatchTransformer):
{'c2_h': 1}
{'c2_e': 1}

Like in `scikit-learn`, you can also specify the expected categories manually.
This is handy when you want to constrain the category encoding space to,
e.g., the top 20% most popular category values picked in advance.

>>> categories = {'c1': {'a', 'h'}, 'c2': {'x', 'e'}}
>>> oh = preprocessing.OneHotEncoder(categories=categories)
ColdTeapot273K marked this conversation as resolved.
Show resolved Hide resolved
>>> for x in X:
... oh.learn_one(x)
... pprint(oh.transform_one(x))
{'c1_a': 0, 'c1_h': 0, 'c2_e': 0, 'c2_x': 0}
{'c1_a': 1, 'c1_h': 0, 'c2_e': 0, 'c2_x': 1}
{'c1_a': 0, 'c1_h': 0, 'c2_e': 0, 'c2_x': 0}
{'c1_a': 0, 'c1_h': 1, 'c2_e': 1, 'c2_x': 0}

>>> for key in sorted(oh.values.keys()):
... print(key)
... print(sorted(oh.values[key]))
c1
['a', 'h']
c2
['e', 'x']

A subset of the features can be one-hot encoded by piping a `compose.Select` into the
`OneHotEncoder`.

Expand Down Expand Up @@ -192,23 +215,43 @@ class OneHotEncoder(base.MiniBatchTransformer):
c2_x Sparse[uint8, 0]
dtype: object

Explicit categories:

>>> oh = preprocessing.OneHotEncoder(categories=categories)
>>> oh.learn_many(X)
>>> df = oh.transform_many(X)
>>> df.sort_index(axis="columns")
c1_a c1_h c2_e c2_x
0 0 0 0 0
1 1 0 0 1
2 0 0 0 0
"""
ColdTeapot273K marked this conversation as resolved.
Show resolved Hide resolved

def __init__(self, drop_zeros=False, drop_first=False):
def __init__(self, categories="auto", drop_zeros=False, drop_first=False):
    """One-hot encoder state.

    Parameters
    ----------
    categories
        Either ``"auto"`` (default), in which case category values are
        learned incrementally, or an explicit mapping of
        ``feature name -> set of allowed category values``. Explicitly
        provided categories are kept fixed (see `learn_one`).
    drop_zeros
        Whether or not to drop zero-valued entries from the output.
    drop_first
        Whether to drop the first (lowest-sorted) dummy feature.
    """
    self.drop_zeros = drop_zeros
    self.drop_first = drop_first
    self.categories = categories

    if self.categories == "auto":
        # Categories are discovered on the fly, one growing set per feature.
        self.values = collections.defaultdict(set)
    else:
        # NOTE(review): this aliases the caller's mapping rather than
        # copying it — mutating the argument afterwards would leak into
        # the encoder. Confirm whether a defensive copy is desired.
        self.values = self.categories

def learn_one(self, x):
if self.drop_zeros:
return

for i, xi in x.items():
if isinstance(xi, list) or isinstance(xi, set):
for xj in xi:
self.values[i].add(xj)
else:
self.values[i].add(xi)
# NOTE: assume if category mappings are explicitly provided,
# they're intended to be kept fixed.
if self.categories == "auto":
for i, xi in x.items():
if isinstance(xi, list) or isinstance(xi, set):
for xj in xi:
self.values[i].add(xj)
else:
self.values[i].add(xi)

def transform_one(self, x, y=None):
oh = {}
Expand All @@ -217,13 +260,25 @@ def transform_one(self, x, y=None):
if not self.drop_zeros:
oh = {f"{i}_{v}": 0 for i, values in self.values.items() for v in values}

# Add 1s
for i, xi in x.items():
if isinstance(xi, list) or isinstance(xi, set):
for xj in xi:
oh[f"{i}_{xj}"] = 1
else:
oh[f"{i}_{xi}"] = 1
# Add 1
# NOTE: assume if category mappings are explicitly provided,
# no other category values are allowed for output. Aligns with `sklearn` behavior.
if self.categories == "auto":
for i, xi in x.items():
if isinstance(xi, list) or isinstance(xi, set):
for xj in xi:
oh[f"{i}_{xj}"] = 1
else:
oh[f"{i}_{xi}"] = 1
else:
for i, xi in x.items():
if isinstance(xi, list) or isinstance(xi, set):
for xj in xi:
if xj in self.values[i]:
oh[f"{i}_{xj}"] = 1
else:
if xi in self.values[i]:
oh[f"{i}_{xi}"] = 1

if self.drop_first:
oh.pop(min(oh.keys()))
Expand All @@ -234,12 +289,22 @@ def learn_many(self, X):
if self.drop_zeros:
return

for col in X.columns:
self.values[col].update(X[col].unique())
# NOTE: assume if category mappings are explicitly provided,
# they're intended to be kept fixed.
if self.categories == "auto":
for col in X.columns:
self.values[col].update(X[col].unique())

def transform_many(self, X):
oh = pd.get_dummies(X, columns=X.columns, sparse=True, dtype="uint8")

# NOTE: assume if category mappings are explicitly provided,
# no other category values are allowed for output. Aligns with `sklearn` behavior.
if self.categories != "auto":
seen_in_the_past = {f"{col}_{val}" for col, vals in self.values.items() for val in vals}
to_remove = set(oh.columns) - seen_in_the_past
oh.drop(columns=list(to_remove), inplace=True)

if not self.drop_zeros:
seen_in_the_past = {f"{col}_{val}" for col, vals in self.values.items() for val in vals}
to_add = seen_in_the_past - set(oh.columns)
Expand Down
63 changes: 46 additions & 17 deletions river/preprocessing/ordinal.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,26 @@ class OrdinalEncoder(base.MiniBatchTransformer):
{'country': 2, 'place': 1}
{'country': -1, 'place': -1}

Like in `scikit-learn`, you can also specify the expected categories manually.
This is handy when you want to constrain the category encoding space to,
e.g., the top 20% most popular category values picked in advance.

>>> categories = {'country': {'France': 1},
... 'place': {'Burger King': 2, 'Starbucks': 3}}
>>> encoder = preprocessing.OrdinalEncoder(categories=categories)
>>> for x in X:
... print(encoder.transform_one(x))
... encoder.learn_one(x)
{'country': 1, 'place': 0}
{'country': -1, 'place': -1}
{'country': 0, 'place': 2}
{'country': 1, 'place': 2}
{'country': 0, 'place': 3}
{'country': 0, 'place': 3}
{'country': 0, 'place': 0}
{'country': -1, 'place': -1}

>>> import pandas as pd
>>> xb1 = pd.DataFrame(X[0:4], index=[0, 1, 2, 3])
>>> xb2 = pd.DataFrame(X[4:8], index=[4, 5, 6, 7])

Expand All @@ -87,39 +107,47 @@ class OrdinalEncoder(base.MiniBatchTransformer):

def __init__(
    self,
    categories="auto",
    unknown_value: int | None = 0,
    none_value: int = -1,
):
    """Ordinal encoder state.

    Parameters
    ----------
    categories
        Either ``"auto"`` (default), in which case codes are assigned
        incrementally, or an explicit mapping of
        ``feature name -> {category value: code}``. Explicitly provided
        mappings are kept fixed (see `learn_one`).
    unknown_value
        Code emitted for category values that are not known.
    none_value
        Code emitted for ``None`` values.
    """
    self.unknown_value = unknown_value
    self.none_value = none_value
    self.categories = categories
    self.values: collections.defaultdict | dict | None = None

    if self.categories == "auto":
        # We're going to have one auto-incrementing counter per feature.
        # This counter will generate the category codes for each feature,
        # skipping the reserved unknown/none codes.
        self._counters: collections.defaultdict = collections.defaultdict(
            functools.partial(make_counter, {unknown_value, none_value})
        )

        # We're going to store the categories in a dict of dicts. The outer
        # dict maps each feature to its inner dict, which maps each
        # category to its code.
        self.values = collections.defaultdict(dict)
    else:
        # NOTE(review): this aliases the caller's mapping rather than
        # copying it — mutating the argument afterwards would leak into
        # the encoder. Confirm whether a defensive copy is desired.
        self.values = self.categories

def transform_one(self, x):
return {
i: self.none_value if xi is None else self.categories[i].get(xi, self.unknown_value)
i: self.none_value if xi is None else self.values[i].get(xi, self.unknown_value)
for i, xi in x.items()
}

def learn_one(self, x):
for i, xi in x.items():
if xi is not None and xi not in self.categories[i]:
self.categories[i][xi] = next(self._counters[i])
if self.categories == "auto":
for i, xi in x.items():
if xi is not None and xi not in self.values[i]:
self.values[i][xi] = next(self._counters[i])

def transform_many(self, X):
return pd.DataFrame(
{
i: pd.Series(
X[i]
.map({**self.categories[i], None: self.none_value})
.map({**self.values[i], None: self.none_value})
.fillna(self.unknown_value),
dtype=np.int64,
)
Expand All @@ -128,7 +156,8 @@ def transform_many(self, X):
)

def learn_many(self, X, y=None):
for i in X.columns:
for xi in X[i].dropna().unique():
if xi not in self.categories[i]:
self.categories[i][xi] = next(self._counters[i])
if self.categories == "auto":
for i in X.columns:
for xi in X[i].dropna().unique():
if xi not in self.values[i]:
self.values[i][xi] = next(self._counters[i])
Loading