@@ -187,6 +187,278 @@ def qcd_estimation(task, hists):
187
187
188
188
return hists
189
189
190
def flat_s(task, hists: dict[hist.Histogram]) -> dict[hist.Histogram]:
    """Rebinning of the histograms in *hists* to achieve a flat-signal distribution.

    Args:
        task (TODO): task instance that contains the process information
        hists (dict[hist.Histogram]): a dictionary of histograms using Process instances as keys

    Returns:
        dict[hist.Histogram]: a dictionary of histograms using Process instances as keys
    """
    # NOTE: imported at the top of the function (not further down) because the nested
    # function definitions below carry annotations referencing "hist" that are evaluated
    # as soon as flat_s runs
    import hist

    def find_edges(signal_histogram, background_histograms, variable, n_bins=10) -> tuple[np.ndarray, np.ndarray]:
        """
        Determine new bin edges that result in a flat signal distribution.
        The edges are determined by the signal distribution, while the background distribution
        is used to ensure that the background yield in each bin is sufficient.
        """
        def get_integral(cumulative_weights, stop, offset=0):
            """
            Helper to calculate the integral of *cumulative_weights* between the *offset* (included)
            and the *stop* index (not included).
            """
            return cumulative_weights[stop - 1] - (0 if offset == 0 else cumulative_weights[offset - 1])

        def prepare_background(histogram: hist.Histogram) -> tuple[np.ndarray, np.ndarray, np.ndarray]:  # noqa
            """
            Helper to extract information from background histograms.

            Returns:
                tuple[np.ndarray]: a tuple containing the array that describes the bin yield,
                the number of equivalent bins and the cumulative bin yield.
            """
            bin_yield = histogram.counts()
            # y^2 / sigma^2, where y is the yield, sigma is the uncertainty;
            # these are the number of events with weight 1 and same statistical fluctuation
            number_of_equivalent_bins = bin_yield ** 2 / histogram.variances()
            # flip so all arrays start from the right (high score) side
            bin_yield = np.flip(bin_yield, axis=-1)
            cumulative_bin_yield = np.cumsum(bin_yield, axis=0)
            return (
                bin_yield,
                np.flip(number_of_equivalent_bins, axis=-1),
                cumulative_bin_yield,
            )

        # prepare parameters
        low_edge, max_edge = 0, 1
        bin_edges = [max_edge]
        indices_gathering = [0]

        # bookkeep reasons for stopping binning
        stop_reason = ""
        # accumulated signal yield up to the current index
        y_already_binned = 0.0
        y_min = 1.0e-5
        # during binning, do not remove leading entries;
        # instead remember the index that denotes the start of the bin
        offset = 0

        # prepare signal
        # fine-binned histogram bin centers are approx. equivalent to the dnn output;
        # flip arrays to start from the right
        dnn_score_signal = np.flip(signal_histogram.axes[variable].centers, axis=-1)
        y_signal = np.flip(signal_histogram.counts(), axis=-1)

        # calculate cumulative of reversed signal yield and yield per bin
        cumulu_y_signal = np.cumsum(y_signal, axis=0)
        full_cum = cumulu_y_signal[-1]
        y_per_bin = full_cum / n_bins
        num_events = len(cumulu_y_signal)

        # prepare background
        for process, histogram in background_histograms.items():
            if process.name == "tt":
                tt_y, tt_num_eq, cumulu_tt_y = prepare_background(histogram)
            elif process.name == "dy":
                dy_y, dy_num_eq, cumulu_dy_y = prepare_background(histogram)

        # start binning
        while len(bin_edges) < n_bins:
            # stopping condition 1: reached end of events
            if offset >= num_events:
                stop_reason = "no more events left"
                break
            # stopping condition 2: remaining signal yield too small;
            # this would lead to a bin completely filled with background
            y_remaining = full_cum - y_already_binned
            if y_remaining < y_min:
                stop_reason = "remaining signal yield insufficient"
                break
            # find the index of the event that would result in a signal yield increase of more
            # than the expected per-bin yield;
            # this index would mark the start of the next bin given all constraints are met
            if y_remaining >= y_per_bin:
                threshold = y_already_binned + y_per_bin
                # get indices of array of values above threshold;
                # first entry defines the next bin edge;
                # shift next idx by offset
                next_idx = offset + np.where(cumulu_y_signal[offset:] > threshold)[0][0]
            else:
                # special case: remaining signal yield smaller than the expected per-bin yield,
                # so find the last event
                next_idx = offset + np.where(cumulu_y_signal[offset:])[0][-1] + 1

            # advance the index until background constraints are met
            while next_idx < num_events:
                # get the number of monte carlo tt and dy events
                tt_num_events = get_integral(tt_num_eq, next_idx, offset)
                # bug fix: this previously integrated tt_num_eq, so the dy event-count
                # constraint below was silently evaluated on the tt statistics
                dy_num_events = get_integral(dy_num_eq, next_idx, offset)
                tt_yield = get_integral(cumulu_tt_y, next_idx, offset)
                dy_yield = get_integral(cumulu_dy_y, next_idx, offset)

                # evaluate constraints
                # TODO: potentially relax constraints here, e.g. when there are 3 (4?) tt events, drop the constraint
                # on dy, and vice-versa
                constraints_met = (
                    # have at least 1 tt, 1 dy and at least 4 background events;
                    # scale by lumi ratio to be more fair to the smaller dataset
                    tt_num_events >= 1 and
                    dy_num_events >= 1 and
                    tt_num_events + dy_num_events >= 4 and
                    # yields must be positive to avoid negative sums of weights per process
                    tt_yield > 0 and
                    dy_yield > 0
                )
                if constraints_met:
                    # TODO: maybe also check if the background conditions are just barely met and advance next_idx
                    # to the middle between the current value and the next one that would change anything about the
                    # background predictions; this might be more stable as the current implementation can highly
                    # depend on the exact value of a single event (the one that tips the constraints over the edge
                    # to fulfillment)
                    # bin found, stop
                    break

                # constraints not met, advance index to include the next tt or dy event and try again
                next_idx += 1
            else:
                # stopping condition 3: no more events left, so the last bin (most left one) does not fulfill
                # constraints; however, this should practically never happen
                stop_reason = "no more events left while trying to fulfill constraints"
                break

            # next_idx found, update values
            # get next edge or set to low edge if end is reached
            if next_idx == num_events:
                edge_value = low_edge
            else:
                # calculate bin center as new edge
                edge_value = float(dnn_score_signal[next_idx - 1:next_idx + 1].mean())
            # prevent out of bounds values and push them to the boundaries
            bin_edges.append(max(min(edge_value, max_edge), low_edge))

            y_already_binned += get_integral(cumulu_y_signal, next_idx, offset)
            offset = next_idx
            indices_gathering.append(next_idx)

        # make sure the lower dnn_output (max events) is included
        if bin_edges[-1] != low_edge:
            if len(bin_edges) > n_bins:
                raise RuntimeError(f"number of bins reached and initial bin edge is not minimal bin edge (edges: {bin_edges})")  # noqa
            bin_edges.append(low_edge)
            indices_gathering.append(num_events)

        # some debugging output
        n_bins_actual = len(bin_edges) - 1
        if n_bins_actual > n_bins:
            raise Exception("number of actual bins ended up larger than requested (implementation bug)")
        if n_bins_actual < n_bins:
            print(
                f" started from {num_events} bins, targeted {n_bins} but ended at {n_bins_actual} bins\n"
                f" -> reason: {stop_reason or 'NO REASON!?'}",
            )
            n_bins = n_bins_actual

        # flip indices to the right order
        indices_gathering = (np.flip(indices_gathering) - num_events) * -1
        return np.flip(np.array(bin_edges), axis=-1), indices_gathering

    def apply_edges(h: hist.Hist, edges: np.ndarray, indices: np.ndarray, variable: tuple[str]) -> hist.Hist:
        """
        Rebin the content axes determined by *variable* of a given hist histogram *h* to
        given *edges* and their *indices*.
        The rebinned histogram is returned.

        Args:
            h (hist.Hist): hist histogram that is to be rebinned
            edges (np.ndarray): an array of ascending bin edges
            indices (np.ndarray): an array of indices that define the new bin edges
            variable (str): variable name that is rebinned

        Returns:
            hist.Hist: rebinned hist histogram
        """
        # sort edges and indices; by default they are already sorted
        ascending_order = np.argsort(edges)
        edges, indices = edges[ascending_order], indices[ascending_order]

        # create new hist and add axes with corresponding edges:
        # define new axes from the old histogram and the rebinned variable with a new axis
        axes = (
            [h.axes[axis] for axis in h.axes.name if axis not in variable] +
            [hist.axis.Variable(edges, name=variable, label=f"{variable}-flat-s")]
        )

        new_hist = hist.Hist(*axes, storage=hist.storage.Weight())

        # slice the old histogram storage view with new edges and
        # sum over sliced bin contents to get the rebinned content
        slices = [slice(int(indices[index]), int(indices[index + 1])) for index in range(0, len(indices) - 1)]
        slice_array = [np.sum(h.view()[..., _slice], axis=-1, keepdims=True) for _slice in slices]
        # concatenate the slices to get the new bin content and
        # store it in the new histogram storage view
        np.concatenate(slice_array, axis=-1, out=new_hist.view())

        return new_hist

    n_bins = 10
    # find signal histogram for which you will optimize, only 1 signal process is allowed;
    # initialize to None so the "no signal found" branch below does not raise a NameError
    signal_proc = None
    signal_hist = None
    background_hists = {}
    for process, histogram in hists.items():
        if process.has_tag("signal"):
            signal_proc = process
            signal_hist = histogram
        else:
            background_hists[process] = histogram

    if not signal_proc:
        logger.warning("could not find any signal process, return hist unchanged")
        return hists

    # 1. preparation
    # get the leaf categories (e.g. {etau,mutau}__os__iso)
    leaf_cats = task.config_inst.get_category(task.branch_data.category).get_leaf_categories()

    # sum over different leaf categories
    cat_ids_locations = [hist.loc(category.id) for category in leaf_cats]
    combined_signal_hist = signal_hist[{"category": cat_ids_locations}]
    combined_signal_hist = combined_signal_hist[{"category": sum}]
    # remove shift axis, since it is always nominal
    combined_signal_hist = combined_signal_hist[{"shift": hist.loc(0)}]

    # same for background
    for process, histogram in background_hists.items():
        combined_background_hist = histogram[{"category": cat_ids_locations}]
        combined_background_hist = combined_background_hist[{"category": sum}]
        combined_background_hist = combined_background_hist[{"shift": hist.loc(0)}]
        background_hists[process] = combined_background_hist

    # 2. determine bin edges
    flat_s_edges, flat_s_indices = find_edges(
        signal_histogram=combined_signal_hist,
        variable=task.variables[0],
        background_histograms=background_hists,
        n_bins=n_bins,
    )

    # 3. apply to hists
    for process, histogram in hists.items():
        hists[process] = apply_edges(
            histogram,
            flat_s_edges,
            flat_s_indices,
            task.variables[0],
        )

    return hists
+
190
461
# register histogram hooks on the config so tasks can apply them by name
config.x.hist_hooks = {
    "qcd": qcd_estimation,
    "flat_s": flat_s,
}
0 commit comments