#!/usr/bin/python
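
"""Helper utilities for the H->WW template/datacard-making workflow: sample-name
mapping, summed-weight bookkeeping, histogram blinding, and template loading."""
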
import json
import pickle as pkl
import warnings
from typing import List

import numpy as np
import scipy.special
from hist import Hist

from systematics import sigs

warnings.filterwarnings("ignore", message="Found duplicate branch ")

combine_samples_by_name = {
"GluGluHToWW_Pt-200ToInf_M-125": "ggF",
"VBFHToWWToAny_M-125_TuneCP5_withDipoleRecoil": "VBF",
"ttHToNonbb_M125": "ttH",
"HWminusJ_HToWW_M-125": "WH",
"HWplusJ_HToWW_M-125": "WH",
"HZJ_HToWW_M-125": "ZH",
"GluGluZH_HToWW_M-125_TuneCP5_13TeV-powheg-pythia8": "ZH", # TODO: skip
"GluGluHToTauTau": "HTauTau",
}

combine_samples = {
# data
"SingleElectron_": "Data",
"SingleMuon_": "Data",
"EGamma_": "Data",
# bkg
"TT": "TTbar",
"WJetsToLNu_": "WJetsLNu",
"ST_": "SingleTop",
"WW": "Diboson",
"WZ": "Diboson",
"ZZ": "Diboson",
"EWK": "EWKvjets",
"DYJets": "DYJets",
"JetsToQQ": "WZQQ",
    # TODO: make sure WZQQ is NLO in the next iteration
# "DYJets": "WZQQorDYJets",
# "JetsToQQ": "WZQQorDYJets",
}

# (name in templates, name in cards)
labels = {
# sigs
"ggF": "ggH_hww",
"VBF": "qqH_hww",
"ttH": "ttH_hww",
"WH": "WH_hww",
"ZH": "ZH_hww",
# BKGS
"WJetsLNu": "wjets",
"TTbar": "ttbar",
"SingleTop": "singletop",
"Diboson": "diboson",
"EWKvjets": "ewkvjets",
    # TODO: make sure WZQQ is NLO in the next iteration
"DYJets": "zjets",
"WZQQ": "wzqq",
# "WZQQorDYJets": "vjets",
"Fake": "fake",
}


def get_common_sample_name(sample):
    """Maps a dataset name onto the common sample label used in the templates."""
    # first: check if the sample matches one of the keys in combine_samples_by_name
    sample_to_use = None
    for key in combine_samples_by_name:
        if key in sample:
            sample_to_use = combine_samples_by_name[key]
            break

    # second: if not, combine it under a common label; the for/else keeps the
    # original name when no key matches
    if sample_to_use is None:
        for key in combine_samples:
            if key in sample:
                sample_to_use = combine_samples[key]
                break
        else:
            sample_to_use = sample

    return sample_to_use
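
# A minimal usage sketch (the dataset names below are illustrative):
#
#     get_common_sample_name("GluGluHToWW_Pt-200ToInf_M-125")  # -> "ggF"
#     get_common_sample_name("SingleMuon_Run2017C")            # -> "Data"
#     get_common_sample_name("UnlistedDataset")                # -> "UnlistedDataset"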


def get_sum_sumgenweight(pkl_files, year, sample):
    """Sums the ``sumgenweight`` entries of the metadata pickles of a given sample."""
    sum_sumgenweight = 0
    for ifile in pkl_files:
        # load the metadata of each .pkl file and accumulate its sumgenweight
        with open(ifile, "rb") as f:
            metadata = pkl.load(f)
        sum_sumgenweight += metadata[sample][year]["sumgenweight"]
    return sum_sumgenweight
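
# The metadata pickles are expected to be nested dicts keyed by sample and year,
# e.g. (values made up for illustration):
#
#     {"GluGluHToWW_Pt-200ToInf_M-125": {"2017": {"sumgenweight": 1234.5}}}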


def get_sum_sumpdfweight(pkl_files, year, sample, sample_to_use):
    """Sums the 103 ``sumpdfweight`` entries of the metadata pickles of a given sample.

    Only filled for the signals, WJetsLNu and TTbar; returns 1 otherwise.
    """
    if (sample_to_use in sigs + ["WJetsLNu", "TTbar"]) and (sample != "ST_s-channel_4f_hadronicDecays"):
        sum_sumpdfweight = {key: 0 for key in range(103)}
        for ifile in pkl_files:
            # load the metadata of each .pkl file and accumulate its sumpdfweight
            with open(ifile, "rb") as f:
                metadata = pkl.load(f)
            for key in range(103):
                sum_sumpdfweight[key] += metadata[sample][year]["sumpdfweight"][key]
        return sum_sumpdfweight
    else:
        return 1


def get_sum_sumscaleweight(pkl_files, year, sample, sample_to_use):
    """Sums the ``sumlheweight`` entries (LHE scale variations) of the metadata pickles.

    Only filled for the signals, WJetsLNu, TTbar and SingleTop; returns 1 otherwise.
    """
    # muR/muF scale-variation indices; 2 and 6 (the anti-correlated variations) are skipped
    scale_indices = [0, 1, 3, 5, 7, 8, 4]
    if (sample_to_use in sigs + ["WJetsLNu", "TTbar", "SingleTop"]) and (sample != "ST_s-channel_4f_hadronicDecays"):
        sum_sumlheweight = {key: 0 for key in scale_indices}
        for ifile in pkl_files:
            # load the metadata of each .pkl file and accumulate its sumlheweight
            with open(ifile, "rb") as f:
                metadata = pkl.load(f)
            for key in scale_indices:
                sum_sumlheweight[key] += metadata[sample][year]["sumlheweight"][key]
        return sum_sumlheweight
    else:
        return 1


def get_xsecweight(pkl_files, year, sample, sample_to_use, is_data, luminosity):
    """
    Returns the per-event weight xsec*lumi/sumgenweight, together with the summed
    gen-, pdf- and scale-weights needed to normalize the systematic variations.
    Returns (1, 1, 1, 1) for data and None if no cross section is defined.
    """
    if not is_data:
        # find the cross section of the sample
        with open("../fileset/xsec_pfnano.json") as f:
            xsec = json.load(f)
        try:
            # values may be stored as strings holding arithmetic expressions, hence the eval
            xsec = eval(str((xsec[sample])))
        except (KeyError, ValueError):
            print(f"sample {sample} doesn't have xsecs defined in xsec_pfnano.json so will skip it")
            return None
        # get the overall weighting of the events: each event carries a genweight;
        # sumgenweight sums over the events in a chunk and sum_sumgenweight over the chunks
        sumgenweights = get_sum_sumgenweight(pkl_files, year, sample)
        sumpdfweights = get_sum_sumpdfweight(pkl_files, year, sample, sample_to_use)
        sumscaleweights = get_sum_sumscaleweight(pkl_files, year, sample, sample_to_use)
        xsecweight = (xsec * luminosity) / sumgenweights
        return xsecweight, sumgenweights, sumpdfweights, sumscaleweights
    else:
        return 1, 1, 1, 1
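
# A minimal usage sketch (the paths, year and luminosity below are hypothetical):
#
#     out = get_xsecweight(
#         pkl_files=["out/chunk0.pkl", "out/chunk1.pkl"],
#         year="2017",
#         sample="GluGluHToWW_Pt-200ToInf_M-125",
#         sample_to_use="ggF",
#         is_data=False,
#         luminosity=41480.0,  # in /pb
#     )
#     if out is not None:
#         xsecweight, sumgenweights, sumpdfweights, sumscaleweights = out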


def shape_to_num(var, nom, clip=1.5):
    """
    Estimates the normalization effect of a shape systematic by integrating the varied
    template and dividing by the integral of the nominal template.
    """
    nom_rate = np.sum(nom)
    var_rate = np.sum(var)
    if var_rate == nom_rate:  # e.g. in cases when both are zero
        return 1
    if nom_rate == 0:  # guard against dividing by an empty nominal template
        return 1
    if abs(var_rate / nom_rate) > clip:
        var_rate = clip * nom_rate
    if var_rate < 0:
        var_rate = 0
    return var_rate / nom_rate
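
# For example (made-up template contents):
#
#     shape_to_num(np.array([1.2, 2.1]), np.array([1.0, 2.0]))    # -> 1.1
#     shape_to_num(np.array([10.0, 10.0]), np.array([1.0, 2.0]))  # clipped to 1.5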


def get_template(h, sample, region):
    """Returns the nominal template of ``sample`` in the given analysis ``region``."""
    return h[{"Sample": sample, "Systematic": "nominal", "Region": region}]


def get_template_diffbins(h, sample):
    """Returns the nominal template of ``sample``, or 0 if the sample is not in ``h``."""
    if sample not in h.axes["Sample"]:
        return 0
    return h[{"Sample": sample, "Systematic": "nominal"}]


def load_templates(years, lep_channels, outdir):
    """Loads the hist templates that were created using ``make_templates.py``."""
    if len(years) == 4:  # all four data-taking years
        save_as = "Run2"
    else:
        save_as = "_".join(years)

    if len(lep_channels) == 1:
        save_as += f"_{lep_channels[0]}_"

    with open(f"{outdir}/hists_templates_{save_as}.pkl", "rb") as f:
        hists_templates = pkl.load(f)

    return hists_templates
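
# A minimal usage sketch (the year list, channel and directory are assumptions):
#
#     hists_templates = load_templates(["2016", "2016APV", "2017", "2018"], ["mu"], "templates/v1")
#     # -> loads templates/v1/hists_templates_Run2_mu_.pkl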


def blindBins(h: Hist, blind_region: List = [90, 160], blind_samples: List[str] = []):
    """
    Blind (i.e. zero out) the bins of histogram ``h`` inside the mass window ``blind_region``.
    If ``blind_samples`` is specified, only blinds those samples, else blinds all of them.
    CAREFUL: assumes axis=0 is the sample axis and axis=3 is the mass axis.
    """
    h = h.copy()

    massbins = h.axes["mass_observable"].edges
    lv = int(np.searchsorted(massbins, blind_region[0], "right"))
    rv = int(np.searchsorted(massbins, blind_region[1], "left") + 1)

    if len(blind_samples) >= 1:
        for blind_sample in blind_samples:
            # list.index raises a ValueError for a missing sample (np.argmax would
            # silently pick index 0)
            sample_index = list(h.axes[0]).index(blind_sample)
            h.view(flow=True)[sample_index, :, :, lv:rv].value = 0
            h.view(flow=True)[sample_index, :, :, lv:rv].variance = 0
    else:
        h.view(flow=True)[:, :, :, lv:rv].value = 0
        h.view(flow=True)[:, :, :, lv:rv].variance = 0

    return h
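
# A minimal usage sketch (the histogram lookup and sample name are assumptions):
#
#     h_blinded = blindBins(hists_templates["pass"], blind_region=[90, 160], blind_samples=["Data"])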


def get_finetuned_score(data, model_path):
    """Runs the fine-tuned ONNX model on the 128 ParT hidden-neuron columns of ``data``
    and returns the softmax score of the first output node."""
    import onnx
    import onnxruntime as ort

    # the 128 ParT hidden-neuron columns are the model inputs
    input_dict = {
        "highlevel": data.loc[:, "fj_ParT_hidNeuron000":"fj_ParT_hidNeuron127"].values.astype("float32"),
    }

    onnx_model = onnx.load(model_path)
    onnx.checker.check_model(onnx_model)

    ort_sess = ort.InferenceSession(
        model_path,
        providers=["AzureExecutionProvider"],
    )
    outputs = ort_sess.run(None, input_dict)

    return scipy.special.softmax(outputs[0], axis=1)[:, 0]
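
# A minimal usage sketch (the dataframe and model path are hypothetical); ``data`` is
# expected to be a pandas DataFrame holding the fj_ParT_hidNeuron000..127 columns:
#
#     import pandas as pd
#     df = pd.read_parquet("skim/ggF_2017.parquet")
#     scores = get_finetuned_score(df, "models/finetuned_part.onnx")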