Add the flat-s hist hook as option for histograms
Within our histogram pipeline no rebinning is currently possible.
This kind of feature is useful for inference models in combine.

    implements a hist hook called "flat-s" that takes a finely binned histogram and rebins it subject to given constraints.
    The hist hook is split into 2 parts:
        - finding edges that fulfill the constraints (see the sketch below)
        - applying these edges to given "Hist" histograms
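
A minimal standalone numpy sketch of the edge-finding idea (toy yields, ignoring the
background constraints; all values below are illustrative, not part of this commit):

    import numpy as np

    # toy finely binned signal yields, already flipped so that the highest dnn scores come first
    y_signal = np.array([0.5, 1.0, 2.0, 1.5, 1.0, 0.5, 0.3, 0.2])
    cum = np.cumsum(y_signal)      # cumulative signal yield
    n_bins = 4
    y_per_bin = cum[-1] / n_bins   # target signal yield per bin (here: 1.75)

    # first indices at which the cumulative yield exceeds each multiple of the per-bin
    # yield; these positions mark where the new bin edges would be placed
    targets = y_per_bin * np.arange(1, n_bins)
    idxs = np.searchsorted(cum, targets, side="right")
    print(idxs)  # [2 3 4]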
Bogdan-Wiederspan committed Oct 24, 2024
1 parent 902b7bf commit 10069a7
Showing 3 changed files with 283 additions and 0 deletions.
4 changes: 4 additions & 0 deletions hbt/config/configs_hbt.py
@@ -147,6 +147,10 @@ def if_era(
if process_name.startswith(("graviton_hh_", "radion_hh_")):
proc.add_tag("signal")
proc.add_tag("resonant_signal")
if process_name.startswith("tt"):
proc.add_tag("is_ttbar")
if process_name.startswith("dy"):
proc.add_tag("is_dy")

# add the process
cfg.add_process(proc)
272 changes: 272 additions & 0 deletions hbt/config/hist_hooks.py
@@ -187,6 +187,278 @@ def qcd_estimation(task, hists):

return hists

def flat_s(task, hists: dict[od.Process, hist.Hist]) -> dict[od.Process, hist.Hist]:
"""Rebinning of the histograms in *hists* to achieve a flat signal distribution.
Args:
task (TODO): task instance that contains the process information
hists (dict[od.Process, hist.Hist]): a dictionary of histograms using Process instances as keys
Returns:
dict[od.Process, hist.Hist]: a dictionary of histograms using Process instances as keys
"""
def find_edges(signal_histogram, background_histograms, variable, n_bins=10) -> tuple[np.ndarray, np.ndarray]:
"""
Determine new bin edges that result in a flat signal distribution.
The edges are determined by the signal distribution, while the background distribution
is used to ensure that the background yield in each bin is sufficient.
"""
def get_integral(cumulative_weights, stop, offset=0):
"""
Helper to calculate the integral of *cumulative_weights* between the *offset* (included)
and the *stop* index (not included)
"""
return cumulative_weights[stop - 1] - (0 if offset == 0 else cumulative_weights[offset - 1])

def prepare_background(histogram: hist.Hist) -> tuple[np.ndarray, np.ndarray, np.ndarray]: # noqa
"""
Helper to extract information from background histograms.
Returns:
tuple[np.ndarray, np.ndarray, np.ndarray]: a tuple containing the bin yields,
the number of equivalent bins and the cumulative bin yield.
"""
bin_yield = histogram.counts()
# y^2 / sigma^2, where y is the yield, sigma is the uncertainty
# these are the number of events with weight 1 and same statistical fluctuation
number_of_equivalent_bins = bin_yield**2 / histogram.variances()
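# e.g. a bin with 10 raw events of weight 0.5: yield 5.0, variance 2.5 -> 5.0**2 / 2.5 = 10 effective events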
bin_yield = np.flip(bin_yield, axis=-1)
cumulative_bin_yield = np.cumsum(bin_yield, axis=0)
return (
bin_yield,
np.flip(number_of_equivalent_bins, axis=-1),
cumulative_bin_yield,
)

# prepare parameters
low_edge, max_edge = 0, 1
bin_edges = [max_edge]
indices_gathering = [0]

# bookkeep reasons for stopping binning
stop_reason = ""
# accumulated signal yield up to the current index
y_already_binned = 0.0
y_min = 1.0e-5
# during binning, do not remove leading entries
# instead remember the index that denotes the start of the bin
offset = 0

# prepare signal
# the bin centers of the finely binned histogram are approximately equal to the dnn output values
# flip arrays to start from the right
dnn_score_signal = np.flip(signal_histogram.axes[variable].centers, axis=-1)
y_signal = np.flip(signal_histogram.counts(), axis=-1)

# calculate cumulative of reversed signal yield and yield per bin
cumulu_y_signal = np.cumsum(y_signal, axis=0)
full_cum = cumulu_y_signal[-1]
y_per_bin = full_cum / n_bins
num_events = len(cumulu_y_signal)

# prepare background

for process, histogram in background_histograms.items():
if process.name == "tt":
tt_y, tt_num_eq, cumulu_tt_y = prepare_background(histogram)
elif process.name == "dy":
dy_y, dy_num_eq, cumulu_dy_y = prepare_background(histogram)

# start binning
while len(bin_edges) < n_bins:
# stopping condition 1: reached end of events
if offset >= num_events:
stop_reason = "no more events left"
break
# stopping condition 2: remaining signal yield too small
# this would lead to a bin completely filled with background
y_remaining = full_cum - y_already_binned
if y_remaining < y_min:
stop_reason = "remaining signal yield insufficient"
break
# find the index of the event that would result in a signal yield increase of more
# than the expected per-bin yield;
# this index would mark the start of the next bin given all constraints are met
if y_remaining >= y_per_bin:
threshold = y_already_binned + y_per_bin
# get indices of array of values above threshold
# first entry defines the next bin edge
# shift next idx by offset
next_idx = offset + np.where(cumulu_y_signal[offset:] > threshold)[0][0]
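# e.g. cumulu_y_signal[offset:] = [0.2, 0.5, 0.9, 1.2] and threshold = 0.8 -> next_idx = offset + 2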
else:
# special case: remaining signal yield smaller than the expected per-bin yield,
# so find the last event
next_idx = offset + np.where(cumulu_y_signal[offset:])[0][-1] + 1

# advance the index until the background constraints are met
while next_idx < num_events:
# get the number of monte carlo tt and dy events
tt_num_events = get_integral(tt_num_eq, next_idx, offset)
dy_num_events = get_integral(dy_num_eq, next_idx, offset)
tt_yield = get_integral(cumulu_tt_y, next_idx, offset)
dy_yield = get_integral(cumulu_dy_y, next_idx, offset)

# evaluate constraints
# TODO: potentially relax constraints here, e.g. when there are 3 (4?) tt events, drop the constraint
# on dy, and vice versa
constraints_met = (
# have at least 1 tt, 1 dy and at least 4 background events in total
# scale by lumi ratio to be more fair to the smaller dataset
tt_num_events >= 1 and
dy_num_events >= 1 and
tt_num_events + dy_num_events >= 4 and

# yields must be positive to avoid negative sums of weights per process
tt_yield > 0 and
dy_yield > 0
)
if constraints_met:
# TODO: maybe also check if the background conditions are just barely met and advance next_idx
# to the middle between the current value and the next one that would change anything about the
# background predictions; this might be more stable as the current implementation can highly
# depend on the exact value of a single event (the one that tips the constraints over the edge
# to fulfillment)
# bin found, stop
break

# constraints not met, advance index to include the next tt or dy event and try again
next_idx += 1
else:
# stopping condition 3: no more events left, so the last bin (leftmost one) does not fulfill
# constraints; however, this should practically never happen
stop_reason = "no more events left while trying to fulfill constraints"
break

# next_idx found, update values
# get next edge or set to low edge if end is reached
if next_idx == num_events:
edge_value = low_edge
else:
# calculate bin center as new edge
edge_value = float(dnn_score_signal[next_idx - 1:next_idx + 1].mean())
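# e.g. flipped centers [0.93, 0.91, ...] and next_idx = 1 -> new edge (0.93 + 0.91) / 2 = 0.92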
# prevent out of bounds values and push them to the boundaries
bin_edges.append(max(min(edge_value, max_edge), low_edge))

y_already_binned += get_integral(cumulu_y_signal, next_idx, offset)
offset = next_idx
indices_gathering.append(next_idx)

# make sure the lowest dnn output value (low edge, covering all remaining events) is included
if bin_edges[-1] != low_edge:
if len(bin_edges) > n_bins:
raise RuntimeError(f"number of bins reached and initial bin edge is not minimal bin edge (edges: {bin_edges})") # noqa
bin_edges.append(low_edge)
indices_gathering.append(num_events)

# some debugging output
n_bins_actual = len(bin_edges) - 1
if n_bins_actual > n_bins:
raise Exception("number of actual bins ended up larger than requested (implementation bug)")
if n_bins_actual < n_bins:
print(
f" started from {num_events} bins, targeted {n_bins} but ended at {n_bins_actual} bins\n"
f" -> reason: {stop_reason or 'NO REASON!?'}",
)
n_bins = n_bins_actual

# flip indices to the right order
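# e.g. indices_gathering = [0, 3, 7] with num_events = 10 -> (np.flip([0, 3, 7]) - 10) * -1 = [3, 7, 10]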
indices_gathering = (np.flip(indices_gathering) - num_events) * -1
return np.flip(np.array(bin_edges), axis=-1), indices_gathering

def apply_edges(h: hist.Hist, edges: np.ndarray, indices: np.ndarray, variable: str) -> hist.Hist:
"""
Rebin the content axis of a given hist histogram *h*, determined by *variable*, to the
given *edges* and their *indices*.
The rebinned histogram is returned.
Args:
h (hist.Hist): hist histogram that is to be rebinned
edges (np.ndarray): an array of ascending bin edges
indices (np.ndarray): an array of indices that define the new bin edges
variable (str): name of the variable that is rebinned
Returns:
hist.Hist: rebinned hist histogram
"""
# sort edges and indices into ascending order (they usually already are)
ascending_order = np.argsort(edges)
edges, indices = edges[ascending_order], indices[ascending_order]

# define the new axes: all axes of the old histogram, with the rebinned variable replaced
# by a variable axis using the corresponding new edges
axes = (
[h.axes[axis] for axis in h.axes.name if axis not in variable] +
[hist.axis.Variable(edges, name=variable, label=f"{variable}-flat-s")]
)

new_hist = hist.Hist(*axes, storage=hist.storage.Weight())

# slice the old histogram storage view with new edges
# sum over sliced bin contents to get rebinned content
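# e.g. indices = [0, 3, 7, 10] -> slices [0:3], [3:7], [7:10]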
slices = [slice(int(indices[index]), int(indices[index + 1])) for index in range(0, len(indices) - 1)]
slice_array = [np.sum(h.view()[..., _slice], axis=-1, keepdims=True) for _slice in slices]
# concatenate the slices to get the new bin content
# store in new histogram storage view
np.concatenate(slice_array, axis=-1, out=new_hist.view())

return new_hist

import hist

n_bins = 10

# find the signal histogram for which the edges are optimized; only 1 signal process is allowed
signal_proc, signal_hist = None, None
background_hists = {}
for process, histogram in hists.items():
if process.has_tag("signal"):
signal_proc = process
signal_hist = histogram
else:
background_hists[process] = histogram

if not signal_proc:
logger.warning("could not find any signal process, returning hists unchanged")
return hists

# 1. preparation
# get the leaf categories (e.g. {etau,mutau}__os__iso)
leaf_cats = task.config_inst.get_category(task.branch_data.category).get_leaf_categories()

# sum over different leaf categories
cat_ids_locations = [hist.loc(category.id) for category in leaf_cats]
combined_signal_hist = signal_hist[{"category": cat_ids_locations}]
combined_signal_hist = combined_signal_hist[{"category": sum}]
# remove the shift axis, since it is always nominal
combined_signal_hist = combined_signal_hist[{"shift": hist.loc(0)}]

# same for background
for process, histogram in background_hists.items():
combined_background_hist = histogram[{"category": cat_ids_locations}]
combined_background_hist = combined_background_hist[{"category": sum}]
combined_background_hist = combined_background_hist[{"shift": hist.loc(0)}]
background_hists[process] = combined_background_hist

# 2. determine bin edges
flat_s_edges, flat_s_indices = find_edges(
signal_histogram=combined_signal_hist,
variable=task.variables[0],
background_histograms=background_hists,
n_bins=n_bins,
)

# 3. apply to hists
for process, histogram in hists.items():
hists[process] = apply_edges(
histogram,
flat_s_edges,
flat_s_indices,
task.variables[0],
)

return hists

config.x.hist_hooks = {
"qcd": qcd_estimation,
"flat_s": flat_s,
}
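
For orientation, a minimal standalone sketch of the rebinning mechanics used in apply_edges,
on a toy 1d histogram (all values and names below are illustrative, not part of this commit):

    import hist
    import numpy as np

    # toy fine histogram with 8 bins and weighted storage
    h = hist.Hist(hist.axis.Regular(8, 0.0, 1.0, name="score"), storage=hist.storage.Weight())
    h.fill(score=np.random.default_rng(0).uniform(0.0, 1.0, 1000))

    edges = np.array([0.0, 0.25, 0.5, 1.0])  # new, ascending edges
    indices = np.array([0, 2, 4, 8])         # fine-bin indices corresponding to those edges

    # sum the weighted storage view over each slice to obtain the rebinned contents
    new_h = hist.Hist(hist.axis.Variable(edges, name="score"), storage=hist.storage.Weight())
    slices = [slice(int(a), int(b)) for a, b in zip(indices[:-1], indices[1:])]
    chunks = [np.sum(h.view()[..., s], axis=-1, keepdims=True) for s in slices]
    np.concatenate(chunks, axis=-1, out=new_h.view())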
7 changes: 7 additions & 0 deletions hbt/config/variables.py
@@ -264,3 +264,10 @@ def add_variables(config: od.Config) -> None:
binning=(25, 0.0, 1.0),
x_title=rf"{proc.upper()} output node, res. DNN",
)

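# finely binned counterpart of the variable above, used as input for the flat-s rebinning hook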
config.add_variable(
name=f"res_dnn_{proc}_fine",
expression=f"res_dnn_{proc}",
binning=(5000, 0.0, 1.0),
x_title=rf"{proc.upper()} output node, res. DNN",
)
