Skip to content

Commit 10069a7

Browse files
Add the flat-s hist hook as option for histograms
Within our histogram pipeline no rebinning is currently possible. This kind of feature is useful for inference models in combine. This commit implements a hist hook called "flat_s" that takes a finely binned histogram and rebins it subject to given constraints. The hist hook is split into 2 parts: - finding edges that fulfill the constraints - applying those edges to given "Hist" histograms
1 parent 902b7bf commit 10069a7

File tree

3 files changed

+283
-0
lines changed

3 files changed

+283
-0
lines changed

hbt/config/configs_hbt.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,10 @@ def if_era(
147147
if process_name.startswith(("graviton_hh_", "radion_hh_")):
148148
proc.add_tag("signal")
149149
proc.add_tag("resonant_signal")
150+
if process_name.startswith("tt"):
151+
proc.add_tag("is_ttbar")
152+
if process_name.startswith("dy"):
153+
proc.add_tag("is_dy")
150154

151155
# add the process
152156
cfg.add_process(proc)

hbt/config/hist_hooks.py

Lines changed: 272 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,278 @@ def qcd_estimation(task, hists):
187187

188188
return hists
189189

190+
def flat_s(task, hists: dict[hist.Histogram]) -> dict[hist.Histogram]:
    """Rebinning of the histograms in *hists* to achieve a flat-signal distribution.

    Args:
        task (TODO): task instance that contains the process information
        hists (dict[hist.Histogram]): A dictionary of histograms using Process instances as keys

    Returns:
        dict[hist.Histogram]: A dictionary of histograms using Process instances as keys
    """
    def find_edges(signal_histogram, background_histograms, variable, n_bins=10) -> tuple[np.ndarray, np.ndarray]:
        """
        Determine new bin edges that result in a flat signal distribution.
        The edges are determined by the signal distribution, while the background distribution
        is used to ensure that the background yield in each bin is sufficient.
        """
        def get_integral(cumulative_weights, stop, offset=0):
            """
            Helper to calculate the integral of *cumulative_weights* between the *offset* (included)
            and the *stop* index (not included)
            """
            return cumulative_weights[stop - 1] - (0 if offset == 0 else cumulative_weights[offset - 1])

        def prepare_background(histogram: hist.Histogram) -> tuple[np.ndarray, np.ndarray, np.ndarray]:  # noqa
            """
            Helper to extract information from background histograms.

            Returns:
                tuple[np.ndarray]: A tuple containing the array that describes the bin yield,
                    the number of equivalent bins and the cumulative bin yield.
            """
            bin_yield = histogram.counts()
            # y^2 / sigma^2, where y is the yield, sigma is the uncertainty
            # these are the number of events with weight 1 and same statistical fluctuation
            number_of_equivalent_bins = bin_yield**2 / histogram.variances()
            bin_yield = np.flip(bin_yield, axis=-1)
            cumulative_bin_yield = np.cumsum(bin_yield, axis=0)
            return (
                bin_yield,
                np.flip(number_of_equivalent_bins, axis=-1),
                cumulative_bin_yield,
            )

        # prepare parameters
        low_edge, max_edge = 0, 1
        bin_edges = [max_edge]
        indices_gathering = [0]

        # bookkeep reasons for stopping binning
        stop_reason = ""
        # accumulated signal yield up to the current index
        y_already_binned = 0.0
        y_min = 1.0e-5
        # during binning, do not remove leading entries
        # instead remember the index that denotes the start of the bin
        offset = 0

        # prepare signal
        # fine binned histograms bin centers are approx equivalent to dnn output
        # flip arrays to start from the right
        dnn_score_signal = np.flip(signal_histogram.axes[variable].centers, axis=-1)
        y_signal = np.flip(signal_histogram.counts(), axis=-1)

        # calculate cumulative of reversed signal yield and yield per bin
        cumulu_y_signal = np.cumsum(y_signal, axis=0)
        full_cum = cumulu_y_signal[-1]
        y_per_bin = full_cum / n_bins
        num_events = len(cumulu_y_signal)

        # prepare background
        for process, histogram in background_histograms.items():
            if process.name == "tt":
                tt_y, tt_num_eq, cumulu_tt_y = prepare_background(histogram)
            elif process.name == "dy":
                dy_y, dy_num_eq, cumulu_dy_y = prepare_background(histogram)

        # start binning
        while len(bin_edges) < n_bins:
            # stopping condition 1: reached end of events
            if offset >= num_events:
                stop_reason = "no more events left"
                break
            # stopping condition 2: remaining signal yield too small
            # this would lead to a bin completely filled with background
            y_remaining = full_cum - y_already_binned
            if y_remaining < y_min:
                stop_reason = "remaining signal yield insufficient"
                break
            # find the index of the event that would result in a signal yield increase of more
            # than the expected per-bin yield;
            # this index would mark the start of the next bin given all constraints are met
            if y_remaining >= y_per_bin:
                threshold = y_already_binned + y_per_bin
                # get indices of array of values above threshold
                # first entry defines the next bin edge
                # shift next idx by offset
                next_idx = offset + np.where(cumulu_y_signal[offset:] > threshold)[0][0]
            else:
                # special case: remaining signal yield smaller than the expected per-bin yield,
                # so find the last event
                next_idx = offset + np.where(cumulu_y_signal[offset:])[0][-1] + 1

            # advance the index until background constraints are met
            while next_idx < num_events:
                # get the number of monte carlo tt and dy events
                tt_num_events = get_integral(tt_num_eq, next_idx, offset)
                # bug fix: use the dy equivalent-event counts here; previously tt_num_eq was
                # (mistakenly) integrated twice, so the dy constraint re-tested tt statistics
                dy_num_events = get_integral(dy_num_eq, next_idx, offset)
                tt_yield = get_integral(cumulu_tt_y, next_idx, offset)
                dy_yield = get_integral(cumulu_dy_y, next_idx, offset)

                # evaluate constraints
                # TODO: potentially relax constraints here, e.g when there are 3 (4?) tt events, drop the constraint
                # on dy, and vice-versa
                constraints_met = (
                    # have at least 1 tt, 1 dy and at least 4 background events
                    # scale by lumi ratio to be more fair to the smaller dataset
                    tt_num_events >= 1 and
                    dy_num_events >= 1 and
                    tt_num_events + dy_num_events >= 4 and
                    # yields must be positive to avoid negative sums of weights per process
                    tt_yield > 0 and
                    dy_yield > 0
                )
                if constraints_met:
                    # TODO: maybe also check if the background conditions are just barely met and advance next_idx
                    # to the middle between the current value and the next one that would change anything about the
                    # background predictions; this might be more stable as the current implementation can highly
                    # depend on the exact value of a single event (the one that tips the constraints over the edge
                    # to fulfillment)
                    # bin found, stop
                    break

                # constraints not met, advance index to include the next tt or dy event and try again
                next_idx += 1
            else:
                # while-else: only reached when the inner loop exhausted all events without a break
                # stopping condition 3: no more events left, so the last bin (most left one) does not fulfill
                # constraints; however, this should practically never happen
                stop_reason = "no more events left while trying to fulfill constraints"
                break

            # next_idx found, update values
            # get next edge or set to low edge if end is reached
            if next_idx == num_events:
                edge_value = low_edge
            else:
                # calculate bin center as new edge
                edge_value = float(dnn_score_signal[next_idx - 1:next_idx + 1].mean())
            # prevent out of bounds values and push them to the boundaries
            bin_edges.append(max(min(edge_value, max_edge), low_edge))

            y_already_binned += get_integral(cumulu_y_signal, next_idx, offset)
            offset = next_idx
            indices_gathering.append(next_idx)

        # make sure the lower dnn_output (max events) is included
        if bin_edges[-1] != low_edge:
            if len(bin_edges) > n_bins:
                raise RuntimeError(f"number of bins reached and initial bin edge is not minimal bin edge (edges: {bin_edges})")  # noqa
            bin_edges.append(low_edge)
            indices_gathering.append(num_events)

        # some debugging output
        n_bins_actual = len(bin_edges) - 1
        if n_bins_actual > n_bins:
            raise Exception("number of actual bins ended up larger than requested (implementation bug)")
        if n_bins_actual < n_bins:
            print(
                f" started from {num_events} bins, targeted {n_bins} but ended at {n_bins_actual} bins\n"
                f" -> reason: {stop_reason or 'NO REASON!?'}",
            )
            n_bins = n_bins_actual

        # flip indices to the right order
        indices_gathering = (np.flip(indices_gathering) - num_events) * -1
        return np.flip(np.array(bin_edges), axis=-1), indices_gathering

    def apply_edges(h: hist.Hist, edges: np.ndarray, indices: np.ndarray, variable: tuple[str]) -> hist.Hist:
        """
        Rebin the content axes determined by *variable* of a given hist histogram *h* to
        given *edges* and their *indices*.
        The rebinned histogram is returned.

        Args:
            h (hist.Hist): hist histogram that is to be rebinned
            edges (np.ndarray): an array of ascending bin edges
            indices (np.ndarray): an array of indices that define the new bin edges
            variable (str): variable name that is rebinned

        Returns:
            hist.Hist: rebinned hist histogram
        """
        # sort edges and indices, by default they are sorted
        ascending_order = np.argsort(edges)
        edges, indices = edges[ascending_order], indices[ascending_order]

        # create new hist and add axes with corresponding edges
        # define new axes, from old histogram and rebinned variable with new axis
        axes = (
            [h.axes[axis] for axis in h.axes.name if axis not in variable] +
            [hist.axis.Variable(edges, name=variable, label=f"{variable}-flat-s")]
        )

        new_hist = hist.Hist(*axes, storage=hist.storage.Weight())

        # slice the old histogram storage view with new edges
        # sum over sliced bin contents to get rebinned content
        slices = [slice(int(indices[index]), int(indices[index + 1])) for index in range(0, len(indices) - 1)]
        slice_array = [np.sum(h.view()[..., _slice], axis=-1, keepdims=True) for _slice in slices]
        # concatenate the slices to get the new bin content
        # store in new histogram storage view
        np.concatenate(slice_array, axis=-1, out=new_hist.view())

        return new_hist

    import hist
    n_bins = 10
    # find signal histogram for which you will optimize, only 1 signal process is allowed
    # bug fix: initialize both to None so the "no signal found" check below does not raise
    # an UnboundLocalError when no process carries the "signal" tag
    signal_proc, signal_hist = None, None
    background_hists = {}
    for process, histogram in hists.items():
        if process.has_tag("signal"):
            signal_proc = process
            signal_hist = histogram
        else:
            background_hists[process] = histogram

    if not signal_proc:
        logger.warning("could not find any signal process, return hist unchanged")
        return hists

    # 1. preparation
    # get the leaf categories (e.g. {etau,mutau}__os__iso)
    leaf_cats = task.config_inst.get_category(task.branch_data.category).get_leaf_categories()

    # sum over different leaf categories
    cat_ids_locations = [hist.loc(category.id) for category in leaf_cats]
    combined_signal_hist = signal_hist[{"category": cat_ids_locations}]
    combined_signal_hist = combined_signal_hist[{"category": sum}]
    # remove shift axis, since its always nominal
    combined_signal_hist = combined_signal_hist[{"shift": hist.loc(0)}]

    # same for background
    for process, histogram in background_hists.items():
        combined_background_hist = histogram[{"category": cat_ids_locations}]
        combined_background_hist = combined_background_hist[{"category": sum}]
        combined_background_hist = combined_background_hist[{"shift": hist.loc(0)}]
        background_hists[process] = combined_background_hist

    # 2. determine bin edges
    flat_s_edges, flat_s_indices = find_edges(
        signal_histogram=combined_signal_hist,
        variable=task.variables[0],
        background_histograms=background_hists,
        n_bins=n_bins,
    )

    # 3. apply to hists
    for process, histogram in hists.items():
        hists[process] = apply_edges(
            histogram,
            flat_s_edges,
            flat_s_indices,
            task.variables[0],
        )

    return hists
460+
190461
    # histogram hooks, selectable at plotting/serialization time by name
    config.x.hist_hooks = {
        "qcd": qcd_estimation,
        "flat_s": flat_s,
    }

hbt/config/variables.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,3 +264,10 @@ def add_variables(config: od.Config) -> None:
264264
binning=(25, 0.0, 1.0),
265265
x_title=rf"{proc.upper()} output node, res. DNN",
266266
)
267+
268+
config.add_variable(
269+
name=f"res_dnn_{proc}_fine",
270+
expression=f"res_dnn_{proc}",
271+
binning=(5000, 0.0, 1.0),
272+
x_title=rf"{proc.upper()} output node, res. DNN",
273+
)

0 commit comments

Comments
 (0)