Skip to content

Record count bugs in OptimalBinningSketch #368

@mdaemen

Description

@mdaemen

I observed two bugs related to the OptimalBinningSketch class.

1. Missing and special record counts are added to the (-inf, inf) bin count when no splits can be obtained.

The example below shows that, in the resulting binning table, the missing and special records are double counted: they appear in their own bins and are also included in the record counts for the (-inf, inf) bin.

Example

import copy

import pandas as pd
from optbinning import OptimalBinningSketch

# Sketch configuration. The single special code is named so that the input
# records below can reference it directly.
special_code = -9999999999.0
binning_kwargs = dict(
    dtype="numerical",
    solver="cp",
    divergence="iv",
    max_n_prebins=20,
    min_bin_size=0.05,
    max_n_bins=10,
    split_digits=4,
    monotonic_trend="auto_asc_desc",
    special_codes=[special_code],
)
optbinning_obj = OptimalBinningSketch(name="var", **binning_kwargs)

# Input data: one special and one missing record spread over three groups.
record_fields = ("group", "var", "target")
record_rows = [
    (1, 0, 1),
    (1, special_code, 1),  # Special
    (1, -10, 0),
    (2, 5, 1),
    (2, 0, 1),
    (2, None, 0),  # Missing
    (3, 10, 1),
    (3, 0, 1),
    (3, -3, 0),
]
records = [dict(zip(record_fields, row)) for row in record_rows]
input_df = pd.DataFrame(data=records)

# Expected binning table: special and missing records are counted only in
# their own bins, not in (-inf, inf).
expected_fields = ("Bin", "Count", "Non-event", "Event")
expected_rows = [
    ("(-inf, inf)", 7, 2, 5),
    ("Special", 1, 0, 1),
    ("Missing", 1, 1, 0),
]
expected_records = [dict(zip(expected_fields, row)) for row in expected_rows]
expected_df = pd.DataFrame(data=expected_records).astype({"Count": "int64"})
expected_df.index = expected_df.index.astype("object")

# Feed every group into its own copy of the sketch, in order of first
# appearance of the group key.
optbinning_list = []
for _, group_df in input_df.groupby("group", sort=False):
    group_sketch = copy.deepcopy(optbinning_obj)
    group_sketch.add(group_df["var"], group_df["target"])
    optbinning_list.append(group_sketch)

# Merge all per-group sketches into the first one, then solve.
final_optbinning_obj, *other_sketches = optbinning_list
for other_sketch in other_sketches:
    final_optbinning_obj.merge(other_sketch)

final_optbinning_obj.solve()

# Results in AssertionError.
actual_df = final_optbinning_obj.binning_table.build().loc[
    0:2, list(expected_fields)
]
pd.testing.assert_frame_equal(left=actual_df, right=expected_df)

input data:

group var target
1 0 1
1 -9999999999 1
1 -10 0
2 5 1
2 0 1
2 NULL 0
3 10 1
3 0 1
3 -3 0

expected binning table:

Bin Count Non-event Event
(-inf, inf) 7 2 5
Special 1 0 1
Missing 1 1 0

actual binning table:

Bin Count Non-event Event
(-inf, inf) 9 3 6
Special 1 0 1
Missing 1 1 0

Fix

Code causing the bug.

Fix I applied locally to work around the issue:

    ...
    def solve(self):
        """Run pre-binning, optimization and post-processing on the added data.

        Returns
        -------
        self : OptimalBinningSketch
            Current fitted optimal binning.

        Raises
        ------
        NotFittedError
            If no data has been added before calling this method.
        """
        t_start = time.perf_counter()

        # Nothing to solve unless at least one batch was added.
        if not self._n_add:
            raise NotFittedError("No data was added. Add data before solving.")

        # --- Pre-binning ---------------------------------------------------
        if self.verbose:
            logger.info("Pre-binning started.")

        t_prebinning_start = time.perf_counter()

        splits, n_nonevent, n_event = self._prebinning_data()
        self._n_prebins = len(splits) + 1

        self._time_prebinning = time.perf_counter() - t_prebinning_start

        if self.verbose:
            prebinning_messages = (
                "Pre-binning: number of prebins: {}".format(self._n_prebins),
                "Pre-binning: number of refinements: {}".format(self._n_refinements),
                "Pre-binning terminated. Time: {:.4f}s".format(self._time_prebinning),
            )
            for message in prebinning_messages:
                logger.info(message)

        # --- Optimization --------------------------------------------------
        self._fit_optimizer(splits, n_nonevent, n_event)

        # --- Post-processing -----------------------------------------------
        if self.verbose:
            logger.info("Post-processing started.")
            logger.info("Post-processing: compute binning information.")

        t_postprocessing_start = time.perf_counter()

        # Changed: when pre-binning yields no splits, build the single-bin
        # counts from the totals minus the missing and special counts, so
        # those records are not counted a second time in that bin.
        if len(splits) == 0:
            excluded_nonevent = self._n_nonevent_missing + self._n_nonevent_special
            excluded_event = self._n_event_missing + self._n_event_special
            n_nonevent = np.array([self._t_n_nonevent - excluded_nonevent])
            n_event = np.array([self._t_n_event - excluded_event])

        binned_counts = bin_info(
            self._solution,
            n_nonevent,
            n_event,
            self._n_nonevent_missing,
            self._n_event_missing,
            self._n_nonevent_special,
            self._n_event_special,
            self._n_nonevent_cat_others,
            self._n_event_cat_others,
            self._cat_others,
        )
        self._n_nonevent, self._n_event = binned_counts

        self._binning_table = BinningTable(
            self.name,
            self.dtype,
            self.special_codes,
            self._splits_optimal,
            self._n_nonevent,
            self._n_event,
            None,
            None,
            self._categories,
            self._cat_others,
            None,
        )

        self._time_postprocessing = time.perf_counter() - t_postprocessing_start

        if self.verbose:
            logger.info(
                "Post-processing terminated. Time: {:.4f}s".format(
                    self._time_postprocessing
                )
            )

        # --- Bookkeeping ---------------------------------------------------
        self._time_total = time.perf_counter() - t_start
        self._time_streaming_solve += self._time_total
        self._n_solve += 1

        if self.verbose:
            logger.info(
                "Optimal binning terminated. Status: {}. Time: {:.4f}s".format(
                    self._status, self._time_total
                )
            )

        # Mark the run as completed before refreshing the streaming stats.
        self._is_solved = True
        self._update_streaming_stats()

        return self

2. Batches that contain only missing and/or special records are 'thrown out' when merging in the BSketch class.

The example below shows that batches containing only missing and/or special records are disregarded when merging, so the counts for these batches are left out of the final binning table.

Example

import copy

import pandas as pd
from optbinning import OptimalBinningSketch

# Sketch configuration. The single special code is named so that the input
# records below can reference it directly.
special_code = -9999999999.0
binning_kwargs = dict(
    dtype="numerical",
    solver="cp",
    divergence="iv",
    max_n_prebins=20,
    min_bin_size=0.05,
    max_n_bins=10,
    split_digits=4,
    monotonic_trend="auto_asc_desc",
    special_codes=[special_code],
)
optbinning_obj = OptimalBinningSketch(name="var", **binning_kwargs)

# Input data: group 2 consists of a single missing record and nothing else.
record_fields = ("group", "var", "target")
record_rows = [
    # Group 1
    (1, -20, 0),
    (1, -15, 0),
    (1, -10, 0),
    (1, -5, 0),
    (1, 0, 1),
    (1, 5, 1),
    (1, 10, 1),
    (1, 15, 1),
    (1, special_code, 0),  # Special
    # Group 2
    (2, None, 1),  # Missing
    # Group 3
    (3, -50, 0),
    (3, -40, 0),
    (3, -30, 0),
    (3, -20, 1),
    (3, -10, 1),
    (3, 0, 1),
    (3, 10, 1),
    (3, 20, 0),
    (3, 30, 0),
]
records = [dict(zip(record_fields, row)) for row in record_rows]
input_df = pd.DataFrame(data=records)

# Expected binning table: the missing record from group 2 must still be
# counted in the "Missing" bin after merging.
expected_fields = ("Bin", "Count", "Non-event", "Event")
expected_rows = [
    ("(-inf, -20.00)", 5, 4, 1),
    ("[-20.00, -10.00)", 3, 2, 1),
    ("[-10.00, inf)", 9, 3, 6),
    ("Special", 1, 1, 0),
    ("Missing", 1, 0, 1),
]
expected_records = [dict(zip(expected_fields, row)) for row in expected_rows]
expected_df = pd.DataFrame(data=expected_records).astype({"Count": "int64"})
expected_df.index = expected_df.index.astype("object")

# Feed every group into its own copy of the sketch, in order of first
# appearance of the group key.
optbinning_list = []
for _, group_df in input_df.groupby("group", sort=False):
    group_sketch = copy.deepcopy(optbinning_obj)
    group_sketch.add(group_df["var"], group_df["target"])
    optbinning_list.append(group_sketch)

# Merge all per-group sketches into the first one, then solve.
final_optbinning_obj, *other_sketches = optbinning_list
for other_sketch in other_sketches:
    final_optbinning_obj.merge(other_sketch)

final_optbinning_obj.solve()

# Results in AssertionError.
actual_df = final_optbinning_obj.binning_table.build().loc[
    0:4, list(expected_fields)
]
pd.testing.assert_frame_equal(left=actual_df, right=expected_df)

input data:

group var target
1 -20 0
1 -15 0
1 -10 0
1 -5 0
1 0 1
1 5 1
1 10 1
1 15 1
1 -9999999999 0
2 NULL 1
3 -50 0
3 -40 0
3 -30 0
3 -20 1
3 -10 1
3 0 1
3 10 1
3 20 0
3 30 0

expected binning table:

Bin Count Non-event Event
(-inf, -20.00) 5 4 1
[-20.00, -10.00) 3 2 1
[-10.00, inf) 9 3 6
Special 1 1 0
Missing 1 0 1

actual binning table:

Bin Count Non-event Event
(-inf, -20.00) 5 4 1
[-20.00, -10.00) 3 2 1
[-10.00, inf) 9 3 6
Special 1 1 0
Missing 0 0 0

Fix

Code causing the bug

Fix I applied locally to work around the issue (I only use the GK sketch):

    ...
    def merge(self, bsketch):
        """Fold the contents of another BSketch instance into this one.

        Parameters
        ----------
        bsketch : object
            BSketch instance.

        Raises
        ------
        Exception
            If ``bsketch`` does not share this instance's signature.
        """
        if not self._mergeable(bsketch):
            raise Exception("bsketch does not share signature.")

        def _holds_no_records(candidate):
            # Changed: an instance only counts as empty when both sketches
            # AND the missing/special counters are zero, so batches made up
            # solely of missing or special records are not 'thrown away'.
            return (
                candidate._sketch_e.n == 0
                and candidate._count_missing_e == 0
                and candidate._count_special_e == 0
                and candidate._sketch_ne.n == 0
                and candidate._count_missing_ne == 0
                and candidate._count_special_ne == 0
            )

        # Nothing to take over from an entirely empty argument.
        if _holds_no_records(bsketch):
            return

        # An entirely empty current instance simply adopts the argument.
        if _holds_no_records(self):
            self._copy(bsketch)
            return

        # Merge sketches
        if self.sketch == "gk":
            # Changed: only merge a sketch that actually holds 'normal' records.
            if bsketch._sketch_e.n != 0:
                self._sketch_e.merge(bsketch._sketch_e)
            if bsketch._sketch_ne.n != 0:
                self._sketch_ne.merge(bsketch._sketch_ne)
        elif self.sketch == "t-digest":
            self._sketch_e += bsketch._sketch_e
            self._sketch_ne += bsketch._sketch_ne

        # Merge missing counts, then special counts.
        self._count_missing_e += bsketch._count_missing_e
        self._count_missing_ne += bsketch._count_missing_ne
        self._count_special_e += bsketch._count_special_e
        self._count_special_ne += bsketch._count_special_ne

Metadata

Metadata

Labels

bug: Something isn't working

Projects

No projects

Relationships

None yet

Development

No branches or pull requests

Issue actions