Skip to content

Commit 1d45a2e

Browse files
feat: save histograms without re-binning (#165)
* save histograms with original binning as produced by coffea to root files * re-bin histograms via cabinetry for statistical inference
1 parent b9407e6 commit 1d45a2e

File tree

4 files changed

+65
-27
lines changed

4 files changed

+65
-27
lines changed

analyses/cms-open-data-ttbar/cabinetry_config.yml

+2-2
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,11 @@ Regions:
99
- Name: "4j1b CR"
1010
RegionPath: "4j1b"
1111
Variable: "$H_T$ [GeV]"
12-
Binning: [ 50. , 95.45454545, 140.90909091, 186.36363636, 231.81818182, 277.27272727, 322.72727273, 368.18181818,413.63636364, 459.09090909, 504.54545455, 550. ]
12+
Binning: [110, 150, 190, 230, 270, 310, 350, 390, 430, 470, 510, 550]
1313
- Name: "4j2b SR"
1414
RegionPath: "4j2b"
1515
Variable: "$m_{bjj}$ [GeV]"
16-
Binning: [ 50. , 95.45454545, 140.90909091, 186.36363636, 231.81818182, 277.27272727, 322.72727273, 368.18181818,413.63636364, 459.09090909, 504.54545455, 550. ]
16+
Binning: [110, 150, 190, 230, 270, 310, 350, 390, 430, 470, 510, 550]
1717

1818
Samples:
1919
- Name: "Pseudodata"

analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb

+21-14
Large diffs are not rendered by default.

analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -503,12 +503,19 @@ def get_query(source: ObjectStream) -> ObjectStream:
503503
# %% [markdown]
504504
# ### Statistical inference
505505
#
506+
# We are going to perform a re-binning for the statistical inference.
507+
# This is planned to be conveniently provided via cabinetry (see [cabinetry#412](https://github.com/scikit-hep/cabinetry/issues/412)), but in the meantime we can achieve this via [template building overrides](https://cabinetry.readthedocs.io/en/latest/advanced.html#overrides-for-template-building).
508+
# The implementation is provided in a function in `utils/`.
509+
#
506510
# A statistical model has been defined in `cabinetry_config.yml`, ready to be used with our output.
507511
# We will use `cabinetry` to combine all histograms into a `pyhf` workspace and fit the resulting statistical model to the pseudodata we built.
508512

509513
# %% tags=[]
510514
config = cabinetry.configuration.load("cabinetry_config.yml")
511-
cabinetry.templates.collect(config)
515+
516+
# rebinning: lower edge 110 GeV, merge bins 2->1
517+
rebinning_router = utils.get_cabinetry_rebinning_router(config, rebinning=slice(110j, None, hist.rebin(2)))
518+
cabinetry.templates.build(config, router=rebinning_router)
512519
cabinetry.templates.postprocess(config) # optional post-processing (e.g. smoothing)
513520
ws = cabinetry.workspace.build(config)
514521
cabinetry.workspace.save(ws, "workspace.json")

analyses/cms-open-data-ttbar/utils/__init__.py

+34-10
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import asyncio
22
import json
33

4+
import cabinetry
5+
from cabinetry.contrib import histogram_reader
46
import hist
57
import matplotlib as mpl
68
import matplotlib.pyplot as plt
@@ -97,31 +99,31 @@ def save_histograms(all_histograms, fileset, filename):
9799

98100
with uproot.recreate(filename) as f:
99101
for region in ["4j1b", "4j2b"]:
100-
f[f"{region}_pseudodata"] = pseudo_data[120j::hist.rebin(2), region]
102+
f[f"{region}_pseudodata"] = pseudo_data[:, region]
101103
for sample in nominal_samples:
102104
sample_name = sample.split("__")[0]
103-
f[f"{region}_{sample_name}"] = all_histograms[120j::hist.rebin(2), region, sample_name, "nominal"]
105+
f[f"{region}_{sample_name}"] = all_histograms[:, region, sample_name, "nominal"]
104106

105107
# b-tagging variations
106108
for i in range(4):
107109
for direction in ["up", "down"]:
108110
variation_name = f"btag_var_{i}_{direction}"
109-
f[f"{region}_{sample_name}_{variation_name}"] = all_histograms[120j::hist.rebin(2), region, sample_name, variation_name]
111+
f[f"{region}_{sample_name}_{variation_name}"] = all_histograms[:, region, sample_name, variation_name]
110112

111113
# jet energy scale variations
112114
for variation_name in ["pt_scale_up", "pt_res_up"]:
113-
f[f"{region}_{sample_name}_{variation_name}"] = all_histograms[120j::hist.rebin(2), region, sample_name, variation_name]
115+
f[f"{region}_{sample_name}_{variation_name}"] = all_histograms[:, region, sample_name, variation_name]
114116

115117
# ttbar modeling
116-
f[f"{region}_ttbar_ME_var"] = all_histograms[120j::hist.rebin(2), region, "ttbar", "ME_var"]
117-
f[f"{region}_ttbar_PS_var"] = all_histograms[120j::hist.rebin(2), region, "ttbar", "PS_var"]
118+
f[f"{region}_ttbar_ME_var"] = all_histograms[:, region, "ttbar", "ME_var"]
119+
f[f"{region}_ttbar_PS_var"] = all_histograms[:, region, "ttbar", "PS_var"]
118120

119-
f[f"{region}_ttbar_scaledown"] = all_histograms[120j :: hist.rebin(2), region, "ttbar", "scaledown"]
120-
f[f"{region}_ttbar_scaleup"] = all_histograms[120j :: hist.rebin(2), region, "ttbar", "scaleup"]
121+
f[f"{region}_ttbar_scaledown"] = all_histograms[:, region, "ttbar", "scaledown"]
122+
f[f"{region}_ttbar_scaleup"] = all_histograms[:, region, "ttbar", "scaleup"]
121123

122124
# W+jets scale
123-
f[f"{region}_wjets_scale_var_down"] = all_histograms[120j :: hist.rebin(2), region, "wjets", "scale_var_down"]
124-
f[f"{region}_wjets_scale_var_up"] = all_histograms[120j :: hist.rebin(2), region, "wjets", "scale_var_up"]
125+
f[f"{region}_wjets_scale_var_down"] = all_histograms[:, region, "wjets", "scale_var_down"]
126+
f[f"{region}_wjets_scale_var_up"] = all_histograms[:, region, "wjets", "scale_var_up"]
125127

126128

127129
class ServiceXDatasetGroup():
@@ -153,3 +155,25 @@ def get_data_rootfiles_uri(self, query, as_signed_url=True, title="Untitled"):
153155
files_per_process.update({process: all_files[parent_key[self.filelist[:,1]==process]]})
154156

155157
return files_per_process
158+
159+
160+
def get_cabinetry_rebinning_router(config, rebinning):
161+
# perform re-binning in cabinetry by providing a custom function reading histograms
162+
# will eventually be replaced via https://github.com/scikit-hep/cabinetry/issues/412
163+
rebinning_router = cabinetry.route.Router()
164+
165+
# this reimplements some of cabinetry.templates.collect
166+
general_path = config["General"]["InputPath"]
167+
variation_path = config["General"].get("VariationPath", None)
168+
169+
# define a custom template builder function; it is registered without any filters, so it is used for all regions, samples and systematics (not only data)
170+
@rebinning_router.register_template_builder()
171+
def build_data_hist(region, sample, systematic, template):
172+
# get path to histogram
173+
histo_path = cabinetry.templates.collector._histo_path(general_path, variation_path, region, sample, systematic, template)
174+
h = hist.Hist(histogram_reader.with_uproot(histo_path)) # turn from boost-histogram into hist
175+
# perform re-binning
176+
h = h[rebinning]
177+
return h
178+
179+
return rebinning_router

0 commit comments

Comments
 (0)