-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
701 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,238 @@ | ||
# coding: utf-8 | ||
|
||
""" | ||
Collection of helper functions for creating inference models | ||
""" | ||
|
||
import law | ||
import order as od | ||
|
||
from columnflow.inference import InferenceModel, ParameterType, ParameterTransformation | ||
from columnflow.util import maybe_import, DotDict | ||
from columnflow.config_util import get_datasets_from_process | ||
|
||
import hbt.inference.constants as const # noqa | ||
|
||
|
||
np = maybe_import("numpy") | ||
ak = maybe_import("awkward") | ||
|
||
logger = law.logger.get_logger(__name__) | ||
|
||
|
||
class HBTInferenceModelBase(InferenceModel):
    """
    Common base for all inference models of the hbt analysis.

    Concrete models are created via ``cls.derive`` and configure the class
    attributes below. On instantiation, categories, processes and parameters
    are added to the model, followed by a final :py:meth:`cleanup`.
    """

    #
    # model attributes, can be changed via `cls.derive`
    #

    # default ml model, used for resolving defaults
    ml_model_name: str = "dense_default"

    # list of all processes/channels/systematics to include in the datacards
    processes: list = []
    channels: list = []
    systematics: list = []

    # customization of channels
    mc_stats: bool = True
    skip_data: bool = True

    def __init__(self, config_inst: od.Config, *args, **kwargs):
        # NOTE(review): *args and **kwargs are accepted but not forwarded to
        # super().__init__ — confirm this is intentional
        super().__init__(config_inst)

        # build the model content
        self.add_inference_categories()
        self.add_inference_processes()
        self.add_inference_parameters()

        # post-processing
        self.cleanup()

    def print_model(self):
        """Helper to print categories, processes and parameters of the InferenceModel."""
        for cat in self.categories:
            print(f"{'=' * 20} {cat.name}")
            print(f"Variable {cat.config_variable} \nCategory {cat.config_category}")
            print(f"Processes {[p.name for p in cat.processes]}")
            print(f"Parameters {set().union(*[[param.name for param in proc.parameters] for proc in cat.processes])}")

    def cat_name(self: InferenceModel, config_cat_inst: od.Category):
        """Return the inference category name derived from a config category."""
        return f"cat_{config_cat_inst.name}"

    def customize_category(self: InferenceModel, cat_inst: DotDict, config_cat_inst: od.Config):
        """Hook to customize an inference category after it has been added."""
        cat_inst.variables_to_plot = self.config_variable

    def add_inference_categories(self: InferenceModel):
        """
        Create one inference category per entry in ``self.config_categories``.
        """
        for config_category in self.config_categories:
            cat_inst = self.config_inst.get_category(config_category)

            cat_name = self.cat_name(cat_inst)
            cat_kwargs = dict(
                config_category=config_category,
                config_variable=self.config_variable,
                mc_stats=self.mc_stats,
            )
            if self.skip_data:
                # build "data" from the sum of MC processes instead of real data
                cat_kwargs["data_from_processes"] = self.processes

            self.add_category(cat_name, **cat_kwargs)

            # do some customization of the inference category
            self.customize_category(self.get_category(cat_name), cat_inst)

    def add_inference_processes(self: InferenceModel):
        """
        Add processes with their corresponding datasets to all categories of
        the inference model.

        :raises Exception: if a process is unknown to the config, or if a
            dataset would be assigned to more than one process.
        """
        used_datasets = set()
        for proc in self.processes:
            if not self.config_inst.has_process(proc):
                raise Exception(f"Process {proc} not included in the config {self.config_inst.name}")

            # get datasets corresponding to this process
            datasets = [
                d.name for d in
                get_datasets_from_process(self.config_inst, proc, strategy="inclusive")
            ]
            # hack for dy: restrict to pt-binned datasets
            if proc == "dy":
                datasets = [x for x in datasets if x.startswith("dy_lep_pt")]

            # check that no dataset is used multiple times
            if datasets_already_used := used_datasets.intersection(datasets):
                raise Exception(f"{datasets_already_used} datasets are used for multiple processes")
            used_datasets |= set(datasets)

            self.add_process(
                const.inference_procnames.get(proc, proc),
                config_process=proc,
                is_signal=(any(x in proc for x in ["HH_", "hh_"])),
                config_mc_datasets=datasets,
            )

    def add_inference_parameters(self: InferenceModel):
        """
        Add all parameters (systematic variations) to the inference model.
        """
        # define the two relevant types of parameter groups
        self.add_parameter_group("experiment")
        self.add_parameter_group("theory")

        # add rate + shape parameters to the inference model
        self.add_rate_parameters()

        # TODO: check that all requested systematics are in final MLModel?

    def add_rate_parameters(self: InferenceModel):
        """
        Add all rate parameters (luminosity, QCD scale, pdf) to the inference
        model; only systematics listed in ``self.systematics`` are considered.
        """
        ecm = self.config_inst.campaign.ecm

        # lumi
        lumi = self.config_inst.x.luminosity
        for unc_name in lumi.uncertainties:
            if unc_name not in self.systematics:
                continue

            self.add_parameter(
                unc_name,
                type=ParameterType.rate_gauss,
                effect=lumi.get(names=unc_name, direction=("down", "up"), factor=True),
                transformations=[ParameterTransformation.symmetrize],
            )
            self.add_parameter_to_group(unc_name, "experiment")

        # add QCD scale (rate) uncertainties to inference model
        # TODO: combine scale and mtop uncertainties for specific processes?
        # TODO: some scale/pdf uncertainties should be rounded to 3 digits, others to 4 digits
        # NOTE: it might be easier to just take the recommended uncertainty values from HH conventions at
        #       https://gitlab.cern.ch/hh/naming-conventions instead of taking the values from CMSDB
        for k, procs in const.processes_per_QCDScale.items():
            syst_name = f"QCDScale_{k}"
            if syst_name not in self.systematics:
                continue

            for proc in procs:
                if proc not in self.processes:
                    continue
                process_inst = self.config_inst.get_process(proc)
                # skip processes without a stored scale uncertainty
                if "scale" not in process_inst.xsecs[ecm]:
                    continue
                self.add_parameter(
                    syst_name,
                    process=const.inference_procnames.get(proc, proc),
                    type=ParameterType.rate_gauss,
                    effect=tuple(map(
                        lambda f: round(f, 3),
                        process_inst.xsecs[ecm].get(names="scale", direction=("down", "up"), factor=True),
                    )),
                )
            self.add_parameter_to_group(syst_name, "theory")

        # add PDF rate uncertainties to inference model
        for k, procs in const.processes_per_pdf_rate.items():
            syst_name = f"pdf_{k}"
            if syst_name not in self.systematics:
                continue

            for proc in procs:
                if proc not in self.processes:
                    continue
                process_inst = self.config_inst.get_process(proc)
                # skip processes without a stored pdf uncertainty
                if "pdf" not in process_inst.xsecs[ecm]:
                    continue

                self.add_parameter(
                    # fixed: previously rebuilt f"pdf_{k}" here; use syst_name
                    # consistently with the group registration below
                    syst_name,
                    process=const.inference_procnames.get(proc, proc),
                    type=ParameterType.rate_gauss,
                    effect=tuple(map(
                        lambda f: round(f, 3),
                        process_inst.xsecs[ecm].get(names="pdf", direction=("down", "up"), factor=True),
                    )),
                )
            self.add_parameter_to_group(syst_name, "theory")
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
# coding: utf-8 | ||
|
||
""" | ||
Collection of configurations that stay constant for the analysis | ||
""" | ||
|
||
# collection of all signal processes
signals_ggHH = {
    "graviton_hh_ggf_bbtautau_m400"
}
signals_qqHH = {
    "graviton_hh_vbf_bbtautau_m400",
}
signals = signals_ggHH | signals_qqHH

# mapping between lepton categories and datasets (only 2017 ATM)
_data_eras = ("b", "c", "d", "e", "f")
data_datasets = {
    "1e": {f"data_e_{era}" for era in _data_eras},
    "1mu": {f"data_mu_{era}" for era in _data_eras},
    "2e": {"data_e_b"},  # TODO: 2 lep datasets in cmsdb + config
    "2mu": {"data_mu_b"},  # TODO
    "emu": {"data_mu_b"},  # TODO
}
merged_datasets = set.union(*data_datasets.values())

# mapping between process names in the config and inference model
# (config process name -> inference model process name)
inference_procnames = {
    "foo": "bar",
}

# mapping, which processes are used for which QCDScale (rate) uncertainty
processes_per_QCDScale = {
    "ttbar": ["tt", "st_tchannel", "st_schannel", "st_twchannel", "ttW", "ttZ"],
    "V": ["dy_lep", "w_lnu"],
    "VV": ["WW", "ZZ", "WZ", "qqZZ"],
    "VVV": ["vvv"],
    "ggH": ["ggH"],
    "qqH": ["qqH"],
    "VH": ["ZH", "WH", "VH"],
    "ttH": ["ttH", "tHq", "tHW"],
    "bbH": ["bbH"],  # contains also pdf and alpha_s
    # "ggHH": signals_ggHH,  # included in inference model (THU_HH)
    "qqHH": signals_qqHH,
    "VHH": [],
    "ttHH": [],
}

# mapping, which processes are used for which pdf (rate) uncertainty
processes_per_pdf_rate = {
    "gg": ["tt", "ttZ", "ggZZ"],
    "qqbar": ["st_schannel", "st_tchannel", "dy_lep", "w_lnu", "vvv", "qqZZ", "ttW"],
    "qg": ["st_twchannel"],
    "Higgs_gg": ["ggH"],
    "Higgs_qqbar": ["qqH", "ZH", "WH", "VH"],
    # "Higgs_qg": [],  # none so far
    "Higgs_ttH": ["ttH", "tHq", "tHW"],
    # "Higgs_bbh": ["bbH"],  # removed
    "Higgs_ggHH": signals_ggHH,
    "Higgs_qqHH": signals_qqHH,
    "Higgs_VHH": ["HHZ", "HHW+", "HHW-"],
    "Higgs_ttHH": ["ttHH"],
}

# mapping for each shape uncertainty, which process is used.
# If "all" is included, takes all processes except for the ones specified (starting with !)
processes_per_shape = {
    # shapes applied to every process
    **{
        _unc: ["all"]
        for _unc in (
            "btag_hf",
            "btag_lf",
            "btag_hfstats1_2017",
            "btag_hfstats2_2017",
            "btag_lfstats1_2017",
            "btag_lfstats2_2017",
            "btag_cferr1",
            "btag_cferr2",
            "mu_trig",
            "e_sf",
            "e_trig",
            "minbias_xs",
            "top_pt",
        )
    },
    # per-process pdf shapes
    **{
        f"pdf_shape_{_sig}": [_sig]
        for _sig in (
            "ggHH_kl_0_kt_1_sl_hbbhww",
            "ggHH_kl_1_kt_1_sl_hbbhww",
            "ggHH_kl_2p45_kt_1_sl_hbbhww",
            "ggHH_kl_5_kt_1_sl_hbbhww",
        )
    },
    "pdf_shape_tt": ["tt"],
    "pdf_shape_st": ["st_schannel", "st_twchannel"],  # TODO: there was some bug with "st_tchannel"
    "pdf_shape_dy": ["dy_lep"],
    "pdf_shape_w": ["w_lnu"],
    # per-process mur/muf envelope shapes
    **{
        f"murf_envelope_{_sig}": [_sig]
        for _sig in (
            "ggHH_kl_0_kt_1_sl_hbbhww",
            "ggHH_kl_1_kt_1_sl_hbbhww",
            "ggHH_kl_2p45_kt_1_sl_hbbhww",
            "ggHH_kl_5_kt_1_sl_hbbhww",
        )
    },
    "murf_envelope_tt": ["tt"],
    "murf_envelope_st": ["st_schannel", "st_tchannel", "st_twchannel"],
    "murf_envelope_dy": ["dy_lep"],
    "murf_envelope_w": ["w_lnu"],
}

# mapping for each shape uncertainty, which shift source is used
# per default: shape and source have the same name (except for pdf and murf, which are implemented per process)
source_per_shape = {
    shape: (
        "pdf" if "pdf_shape" in shape else
        "murf_envelope" if "murf_envelope" in shape else
        shape
    )
    for shape in processes_per_shape
}
Oops, something went wrong.