-
Notifications
You must be signed in to change notification settings - Fork 112
Open
Description
I observed two bugs related to the OptimalBinningSketch class.
1 Missing and Special record counts are added to the (-inf, inf) bin count when no splits can be obtained.
The example below shows that, in the resulting binning table, the missing and special records are double-counted: besides their own rows, they are also included in the record counts for the (-inf, inf) bin.
Example
import copy
import pandas as pd
from optbinning import OptimalBinningSketch
binning_kwargs = {
"dtype": "numerical",
"solver": "cp",
"divergence": "iv",
"max_n_prebins": 20,
"min_bin_size": 0.05,
"max_n_bins": 10,
"split_digits": 4,
"monotonic_trend": "auto_asc_desc",
"special_codes": [
-9999999999.0,
],
}
optbinning_obj = OptimalBinningSketch(name="var", **binning_kwargs)
records = [
{"group": 1, "var": 0, "target": 1},
{"group": 1, "var": binning_kwargs["special_codes"][0], "target": 1}, # Special
{"group": 1, "var": -10, "target": 0},
{"group": 2, "var": 5, "target": 1},
{"group": 2, "var": 0, "target": 1},
{"group": 2, "var": None, "target": 0}, # Missing
{"group": 3, "var": 10, "target": 1},
{"group": 3, "var": 0, "target": 1},
{"group": 3, "var": -3, "target": 0},
]
input_df = pd.DataFrame(data=records)
expected_records = [
{"Bin": "(-inf, inf)", "Count": 7, "Non-event": 2, "Event": 5},
{"Bin": "Special", "Count": 1, "Non-event": 0, "Event": 1},
{"Bin": "Missing", "Count": 1, "Non-event": 1, "Event": 0},
]
expected_df = pd.DataFrame(data=expected_records)
expected_df = expected_df.astype({"Count": "int64"})
expected_df.index = expected_df.index.astype("object")
groups = input_df["group"].unique()
optbinning_list = []
for group in groups:
x = input_df[input_df["group"] == group]["var"]
y = input_df[input_df["group"] == group]["target"]
temp_optbinning_obj = copy.deepcopy(optbinning_obj)
temp_optbinning_obj.add(x, y)
optbinning_list.append(temp_optbinning_obj)
final_optbinning_obj = optbinning_list[0]
for optbinning_entry in optbinning_list[1:]:
final_optbinning_obj.merge(optbinning_entry)
final_optbinning_obj.solve()
# Results in AssertionError.
pd.testing.assert_frame_equal(
left=final_optbinning_obj.binning_table.build().loc[
0:2, ["Bin", "Count", "Non-event", "Event"]
],
right=expected_df,
)input data:
| group | var | target |
|---|---|---|
| 1 | 0 | 1 |
| 1 | -9999999999 | 1 |
| 1 | -10 | 0 |
| 2 | 5 | 1 |
| 2 | 0 | 1 |
| 2 | NULL | 0 |
| 3 | 10 | 1 |
| 3 | 0 | 1 |
| 3 | -3 | 0 |
expected binning table:
| Bin | Count | Non-event | Event |
|---|---|---|---|
| (-inf, inf) | 7 | 2 | 5 |
| Special | 1 | 0 | 1 |
| Missing | 1 | 1 | 0 |
actual binning table:
| Bin | Count | Non-event | Event |
|---|---|---|---|
| (-inf, inf) | 9 | 3 | 6 |
| Special | 1 | 0 | 1 |
| Missing | 1 | 1 | 0 |
Fix
Local fix I applied to work around the issue:
...
def solve(self):
"""Solve optimal binning using added data.
Returns
-------
self : OptimalBinningSketch
Current fitted optimal binning.
"""
time_init = time.perf_counter()
# Check if data was added
if not self._n_add:
raise NotFittedError("No data was added. Add data before solving.")
# Pre-binning
if self.verbose:
logger.info("Pre-binning started.")
time_prebinning = time.perf_counter()
splits, n_nonevent, n_event = self._prebinning_data()
self._n_prebins = len(splits) + 1
self._time_prebinning = time.perf_counter() - time_prebinning
if self.verbose:
logger.info("Pre-binning: number of prebins: {}".format(self._n_prebins))
logger.info(
"Pre-binning: number of refinements: {}".format(self._n_refinements)
)
logger.info(
"Pre-binning terminated. Time: {:.4f}s".format(self._time_prebinning)
)
# Optimization
self._fit_optimizer(splits, n_nonevent, n_event)
# Post-processing
if self.verbose:
logger.info("Post-processing started.")
logger.info("Post-processing: compute binning information.")
time_postprocessing = time.perf_counter()
# Changed: Remove nr of missing and special from total counts.
if not len(splits):
n_nonevent = np.array(
[
self._t_n_nonevent
- (self._n_nonevent_missing + self._n_nonevent_special)
]
)
n_event = np.array(
[self._t_n_event - (self._n_event_missing + self._n_event_special)]
)
self._n_nonevent, self._n_event = bin_info(
self._solution,
n_nonevent,
n_event,
self._n_nonevent_missing,
self._n_event_missing,
self._n_nonevent_special,
self._n_event_special,
self._n_nonevent_cat_others,
self._n_event_cat_others,
self._cat_others,
)
self._binning_table = BinningTable(
self.name,
self.dtype,
self.special_codes,
self._splits_optimal,
self._n_nonevent,
self._n_event,
None,
None,
self._categories,
self._cat_others,
None,
)
self._time_postprocessing = time.perf_counter() - time_postprocessing
if self.verbose:
logger.info(
"Post-processing terminated. Time: {:.4f}s".format(
self._time_postprocessing
)
)
self._time_total = time.perf_counter() - time_init
self._time_streaming_solve += self._time_total
self._n_solve += 1
if self.verbose:
logger.info(
"Optimal binning terminated. Status: {}. Time: {:.4f}s".format(
self._status, self._time_total
)
)
# Completed successfully
self._is_solved = True
self._update_streaming_stats()
return self2 Batches that only contain missing and/or special records are 'thrown out' when merging in BSketch class.
The example below shows that batches containing only missing/special records are disregarded when merging, so the counts for these batches are left out of the final binning table.
Example
import copy
import pandas as pd
from optbinning import OptimalBinningSketch
binning_kwargs = {
"dtype": "numerical",
"solver": "cp",
"divergence": "iv",
"max_n_prebins": 20,
"min_bin_size": 0.05,
"max_n_bins": 10,
"split_digits": 4,
"monotonic_trend": "auto_asc_desc",
"special_codes": [
-9999999999.0,
],
}
optbinning_obj = OptimalBinningSketch(name="var", **binning_kwargs)
records = [
# Group 1
{"group": 1, "var": -20, "target": 0},
{"group": 1, "var": -15, "target": 0},
{"group": 1, "var": -10, "target": 0},
{"group": 1, "var": -5, "target": 0},
{"group": 1, "var": 0, "target": 1},
{"group": 1, "var": 5, "target": 1},
{"group": 1, "var": 10, "target": 1},
{"group": 1, "var": 15, "target": 1},
{"group": 1, "var": binning_kwargs["special_codes"][0], "target": 0}, # Special
# Group 2
{"group": 2, "var": None, "target": 1}, # Missing
# Group 3
{"group": 3, "var": -50, "target": 0},
{"group": 3, "var": -40, "target": 0},
{"group": 3, "var": -30, "target": 0},
{"group": 3, "var": -20, "target": 1},
{"group": 3, "var": -10, "target": 1},
{"group": 3, "var": 0, "target": 1},
{"group": 3, "var": 10, "target": 1},
{"group": 3, "var": 20, "target": 0},
{"group": 3, "var": 30, "target": 0},
]
input_df = pd.DataFrame(data=records)
expected_records = [
{
"Bin": "(-inf, -20.00)",
"Count": 5,
"Non-event": 4,
"Event": 1
},
{
"Bin": "[-20.00, -10.00)",
"Count": 3,
"Non-event": 2,
"Event": 1
},
{
"Bin": "[-10.00, inf)",
"Count": 9,
"Non-event": 3,
"Event": 6
},
{
"Bin": "Special",
"Count": 1,
"Non-event": 1,
"Event": 0
},
{
"Bin": "Missing",
"Count": 1,
"Non-event": 0,
"Event": 1
},
]
expected_df = pd.DataFrame(data=expected_records)
expected_df = expected_df.astype({"Count": "int64"})
expected_df.index = expected_df.index.astype("object")
groups = input_df["group"].unique()
optbinning_list = []
for group in groups:
x = input_df[input_df["group"] == group]["var"]
y = input_df[input_df["group"] == group]["target"]
temp_optbinning_obj = copy.deepcopy(optbinning_obj)
temp_optbinning_obj.add(x, y)
optbinning_list.append(temp_optbinning_obj)
final_optbinning_obj = optbinning_list[0]
for optbinning_entry in optbinning_list[1:]:
final_optbinning_obj.merge(optbinning_entry)
final_optbinning_obj.solve()
# Results in AssertionError.
pd.testing.assert_frame_equal(
left=final_optbinning_obj.binning_table.build().loc[
0:4, ["Bin", "Count", "Non-event", "Event"]
],
right=expected_df,
)input data:
| group | var | target |
|---|---|---|
| 1 | -20 | 0 |
| 1 | -15 | 0 |
| 1 | -10 | 0 |
| 1 | -5 | 0 |
| 1 | 0 | 1 |
| 1 | 5 | 1 |
| 1 | 10 | 1 |
| 1 | 15 | 1 |
| 1 | -9999999999 | 0 |
| 2 | NULL | 1 |
| 3 | -50 | 0 |
| 3 | -40 | 0 |
| 3 | -30 | 0 |
| 3 | -20 | 1 |
| 3 | -10 | 1 |
| 3 | 0 | 1 |
| 3 | 10 | 1 |
| 3 | 20 | 0 |
| 3 | 30 | 0 |
expected binning table:
| Bin | Count | Non-event | Event |
|---|---|---|---|
| (-inf, -20.00) | 5 | 4 | 1 |
| [-20.00, -10.00) | 3 | 2 | 1 |
| [-10.00, inf) | 9 | 3 | 6 |
| Special | 1 | 1 | 0 |
| Missing | 1 | 0 | 1 |
actual binning table:
| Bin | Count | Non-event | Event |
|---|---|---|---|
| (-inf, -20.00) | 5 | 4 | 1 |
| [-20.00, -10.00) | 3 | 2 | 1 |
| [-10.00, inf) | 9 | 3 | 6 |
| Special | 1 | 1 | 0 |
| Missing | 0 | 0 | 0 |
Fix
Local fix I applied to work around the issue (I only use the GK sketch):
...
def merge(self, bsketch):
    """Merge current instance with another BSketch instance.

    A sketch is treated as empty only when it holds no regular, missing or
    special records. Batches that contain nothing but missing and/or
    special records therefore still contribute their counts.

    Parameters
    ----------
    bsketch : object
        BSketch instance.

    Raises
    ------
    Exception
        If ``bsketch`` does not share the signature of this instance.
    """
    if not self._mergeable(bsketch):
        raise Exception("bsketch does not share signature.")

    def _has_no_records(sk):
        # True only when both sketches and all missing/special counters
        # are zero.
        return (
            sk._sketch_e.n == 0
            and sk._count_missing_e == 0
            and sk._count_special_e == 0
            and sk._sketch_ne.n == 0
            and sk._count_missing_ne == 0
            and sk._count_special_ne == 0
        )

    # Changed: make sure bsketch arg has no missing or special records
    # before 'throwing away'.
    if _has_no_records(bsketch):
        return

    # Changed: make sure the current instance has no missing or special
    # records before replacing it with a copy of bsketch.
    if _has_no_records(self):
        self._copy(bsketch)
        return

    # Merge sketches
    if self.sketch == "gk":
        # Changed: make sure bsketch arg has 'normal' records before merging.
        if bsketch._sketch_e.n != 0:
            self._sketch_e.merge(bsketch._sketch_e)
        if bsketch._sketch_ne.n != 0:
            self._sketch_ne.merge(bsketch._sketch_ne)
    elif self.sketch == "t-digest":
        self._sketch_e += bsketch._sketch_e
        self._sketch_ne += bsketch._sketch_ne

    # Merge missing and special counts
    self._count_missing_e += bsketch._count_missing_e
    self._count_missing_ne += bsketch._count_missing_ne
    self._count_special_e += bsketch._count_special_e
    self._count_special_ne += bsketch._count_special_ne
Metadata
Assignees
Labels
bug: Something isn't working