Skip to content

Commit

Permalink
adding inference model grav_model
Browse files Browse the repository at this point in the history
  • Loading branch information
aalvesan committed Apr 23, 2024
1 parent fd3d408 commit ad27dbe
Show file tree
Hide file tree
Showing 6 changed files with 701 additions and 1 deletion.
238 changes: 238 additions & 0 deletions hbt/inference/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
# coding: utf-8

"""
Collection of helper functions for creating inference models
"""

import law
import order as od

from columnflow.inference import InferenceModel, ParameterType, ParameterTransformation
from columnflow.util import maybe_import, DotDict
from columnflow.config_util import get_datasets_from_process

import hbt.inference.constants as const # noqa


np = maybe_import("numpy")
ak = maybe_import("awkward")

logger = law.logger.get_logger(__name__)


class HBTInferenceModelBase(InferenceModel):
    """
    Base class for all inference models in the hbt analysis.

    On instantiation the datacard model is built in three steps — categories,
    processes and parameters — followed by a cleanup. Concrete models are meant
    to be created via ``cls.derive`` and must additionally provide
    ``config_categories`` and ``config_variable``, which are referenced below
    but not declared on this base class.
    """

    #
    # Model attributes, can be changed via `cls.derive`
    #

    # default ml model, used for resolving defaults
    ml_model_name: str = "dense_default"

    # list of all processes/channels/systematics to include in the datacards
    # NOTE(review): these mutable class-level lists are shared by all subclasses
    # that do not override them via `derive` — confirm this is intended
    processes: list = []
    channels: list = []
    systematics: list = []

    # customization of channels
    mc_stats: bool = True
    skip_data: bool = True

    def __init__(self, config_inst: od.Config, *args, **kwargs):
        """
        Build the full inference model from *config_inst*.

        :param config_inst: analysis config providing categories, processes,
            datasets and uncertainty values.
        """
        # NOTE(review): *args / **kwargs are accepted but not forwarded to
        # super().__init__ — confirm this is intentional
        super().__init__(config_inst)

        # build the model: categories first, then processes, then parameters
        self.add_inference_categories()
        self.add_inference_processes()
        self.add_inference_parameters()
        # self.print_model()

        #
        # post-processing
        #

        self.cleanup()

    def print_model(self):
        """ Helper to print categories, processes and parameters of the InferenceModel """
        for cat in self.categories:
            print(f"{'=' * 20} {cat.name}")
            print(f"Variable {cat.config_variable} \nCategory {cat.config_category}")
            print(f"Processes {[p.name for p in cat.processes]}")
            print(f"Parameters {set().union(*[[param.name for param in proc.parameters] for proc in cat.processes])}")

    def cat_name(self: InferenceModel, config_cat_inst: od.Category):
        """
        Function to determine the inference category name from a config category.

        :param config_cat_inst: config category instance whose name is prefixed.
        :return: the inference category name, ``"cat_<config category name>"``.
        """
        # root_cats = config_cat_inst.x.root_cats
        final_name = f"cat_{config_cat_inst.name}"
        return final_name

    # def config_variable(self: InferenceModel, config_cat_inst: od.Config):
    #     """ Function to determine inference variable name from config category """
    #     root_cats = config_cat_inst.x.root_cats
    #     if dnn_cat := root_cats.get("dnn"):
    #         dnn_proc = dnn_cat.replace("ml_", "")
    #         return f"mlscore.{dnn_proc}"
    #     else:
    #         return "mli_mbb"

    def customize_category(self: InferenceModel, cat_inst: DotDict, config_cat_inst: od.Config):
        """
        Hook to customize an inference category after it has been added.

        :param cat_inst: the inference category (a DotDict) to customize in place.
        :param config_cat_inst: the corresponding config category (unused here,
            kept for overriding subclasses).
        """
        # root_cats = config_cat_inst.x.root_cats
        # variables = ["jet1_pt"]
        # if dnn_cat := root_cats.get("dnn"):
        #     dnn_proc = dnn_cat.replace("ml_", "")
        #     variables.append(f"mlscore.{dnn_proc}")
        cat_inst.variables_to_plot = self.config_variable

    def add_inference_categories(self: InferenceModel):
        """
        Create one inference category per entry in ``self.config_categories``.

        When ``self.skip_data`` is set, data is estimated from the simulated
        processes instead of using real data datasets.
        """
        # get the MLModel inst
        # ml_model_inst = MLModel.get_cls(self.ml_model_name)(self.config_inst)
        for config_category in self.config_categories:
            cat_inst = self.config_inst.get_category(config_category)
            # root_cats = cat_inst.x.root_cats
            # lep_channel = root_cats.get("lep")
            # if lep_channel not in self.config_inst.x.lepton_channels:
            #     raise Exception(
            #         "Each inference category needs to be categorized based on number of leptons; "
            #         f"Options: {self.config_inst.x.lepton_channels}",
            #     )

            cat_name = self.cat_name(cat_inst)
            cat_kwargs = dict(
                config_category=config_category,
                config_variable=self.config_variable,
                mc_stats=self.mc_stats,
            )
            if self.skip_data:
                # build "data" from the sum of MC processes instead of real data
                cat_kwargs["data_from_processes"] = self.processes
            # else:
            #     cat_kwargs["config_data_datasets"] = const.data_datasets[lep_channel]

            self.add_category(cat_name, **cat_kwargs)

            # do some customization of the inference category
            self.customize_category(self.get_category(cat_name), cat_inst)

    def add_inference_processes(self: InferenceModel):
        """
        Function that adds processes with their corresponding datasets to all categories
        of the inference model.

        :raises Exception: when a process is missing from the config or when the
            same dataset would be used by more than one process.
        """
        used_datasets = set()
        for proc in self.processes:
            if not self.config_inst.has_process(proc):
                raise Exception(f"Process {proc} not included in the config {self.config_inst.name}")

            # get datasets corresponding to this process
            datasets = [
                d.name for d in
                get_datasets_from_process(self.config_inst, proc, strategy="inclusive")
            ]
            # hack for dy: restrict to the pt-binned datasets only
            if proc == "dy":
                datasets = [x for x in datasets if x.startswith("dy_lep_pt")]

            # check that no dataset is used multiple times
            if datasets_already_used := used_datasets.intersection(datasets):
                raise Exception(f"{datasets_already_used} datasets are used for multiple processes")
            used_datasets |= set(datasets)

            self.add_process(
                const.inference_procnames.get(proc, proc),
                config_process=proc,
                # signal processes are recognized by "HH_"/"hh_" in their name
                # (e.g. "graviton_hh_ggf_bbtautau_m400")
                is_signal=(any(x in proc for x in ["HH_", "hh_"])),
                config_mc_datasets=datasets,
            )

    def add_inference_parameters(self: InferenceModel):
        """
        Function that adds all parameters (systematic variations) to the inference model
        """
        # define the two relevant types of parameter groups
        self.add_parameter_group("experiment")
        self.add_parameter_group("theory")

        # add rate + shape parameters to the inference model
        self.add_rate_parameters()
        # self.add_shape_parameters()

        # TODO: check that all requested systematics are in final MLModel?

    def add_rate_parameters(self: InferenceModel):
        """
        Add all rate parameters (luminosity, QCD scale, pdf) to the inference model.

        Only systematics listed in ``self.systematics`` are added; processes not
        present in ``self.processes`` or lacking the relevant cross-section
        uncertainty are skipped silently.
        """
        ecm = self.config_inst.campaign.ecm

        # lumi
        lumi = self.config_inst.x.luminosity
        for unc_name in lumi.uncertainties:
            if unc_name not in self.systematics:
                continue

            self.add_parameter(
                unc_name,
                type=ParameterType.rate_gauss,
                effect=lumi.get(names=unc_name, direction=("down", "up"), factor=True),
                transformations=[ParameterTransformation.symmetrize],
            )
            self.add_parameter_to_group(unc_name, "experiment")

        # add QCD scale (rate) uncertainties to inference model
        # TODO: combine scale and mtop uncertainties for specific processes?
        # TODO: some scale/pdf uncertainties should be rounded to 3 digits, others to 4 digits
        # NOTE: it might be easier to just take the recommended uncertainty values from HH conventions at
        #       https://gitlab.cern.ch/hh/naming-conventions instead of taking the values from CMSDB
        for k, procs in const.processes_per_QCDScale.items():
            syst_name = f"QCDScale_{k}"
            if syst_name not in self.systematics:
                continue

            for proc in procs:
                if proc not in self.processes:
                    continue
                process_inst = self.config_inst.get_process(proc)
                if "scale" not in process_inst.xsecs[ecm]:
                    continue
                self.add_parameter(
                    syst_name,
                    process=const.inference_procnames.get(proc, proc),
                    type=ParameterType.rate_gauss,
                    effect=tuple(map(
                        lambda f: round(f, 3),
                        process_inst.xsecs[ecm].get(names="scale", direction=("down", "up"), factor=True),
                    )),
                )
            self.add_parameter_to_group(syst_name, "theory")

        # add PDF rate uncertainties to inference model
        for k, procs in const.processes_per_pdf_rate.items():
            syst_name = f"pdf_{k}"
            if syst_name not in self.systematics:
                continue

            for proc in procs:
                if proc not in self.processes:
                    continue
                process_inst = self.config_inst.get_process(proc)
                if "pdf" not in process_inst.xsecs[ecm]:
                    continue

                self.add_parameter(
                    # fix: use syst_name for consistency with the QCDScale loop
                    # (it is the same string as f"pdf_{k}" and matches the group
                    # assignment below)
                    syst_name,
                    process=const.inference_procnames.get(proc, proc),
                    type=ParameterType.rate_gauss,
                    effect=tuple(map(
                        lambda f: round(f, 3),
                        process_inst.xsecs[ecm].get(names="pdf", direction=("down", "up"), factor=True),
                    )),
                )
            self.add_parameter_to_group(syst_name, "theory")

113 changes: 113 additions & 0 deletions hbt/inference/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# coding: utf-8

"""
Collection of configurations that stay constant for the analysis
"""

# collection of all signal processes, split by production mode
signals_ggHH = {
    # "ggHH_kl_0_kt_1_sl_hbbhww", "ggHH_kl_1_kt_1_sl_hbbhww",
    # "ggHH_kl_2p45_kt_1_sl_hbbhww", "ggHH_kl_5_kt_1_sl_hbbhww",
    "graviton_hh_ggf_bbtautau_m400",
}
signals_qqHH = {
    # "qqHH_CV_1_C2V_1_kl_1_sl_hbbhww", "qqHH_CV_1_C2V_1_kl_0_sl_hbbhww", "qqHH_CV_1_C2V_1_kl_2_sl_hbbhww",
    # "qqHH_CV_1_C2V_0_kl_1_sl_hbbhww", "qqHH_CV_1_C2V_2_kl_1_sl_hbbhww",
    # "qqHH_CV_0p5_C2V_1_kl_1_sl_hbbhww", "qqHH_CV_1p5_C2V_1_kl_1_sl_hbbhww",
    "graviton_hh_vbf_bbtautau_m400",
}
# union of all signal processes
signals = signals_ggHH | signals_qqHH

# mapping between lepton categories and datasets (only 2017 ATM)
_data_eras = ("b", "c", "d", "e", "f")
data_datasets = {
    "1e": {f"data_e_{era}" for era in _data_eras},
    "1mu": {f"data_mu_{era}" for era in _data_eras},
    "2e": {"data_e_b"},  # TODO: 2 lep datasets in cmsdb + config
    "2mu": {"data_mu_b"},  # TODO
    "emu": {"data_mu_b"},  # TODO
}
# flat set of all data datasets across lepton categories
merged_datasets = {name for names in data_datasets.values() for name in names}

# mapping between process names in the config and inference model
# (processes absent from this mapping keep their config name)
inference_procnames = {
    # key: config process name, value: inference model process name
    "foo": "bar",  # NOTE(review): looks like a placeholder entry — confirm it is intended to stay
    # "st": "ST",
    # "tt": "TT",
}

# mapping, which processes are used for which QCDScale (rate) uncertainty
# key: suffix of the "QCDScale_{key}" systematic name; value: config process names
# NOTE(review): "qqHH" maps to the signals_qqHH set while other values are lists —
# only iteration/membership is needed downstream, but confirm the mixed types are intended
processes_per_QCDScale = {
    "ttbar": ["tt", "st_tchannel", "st_schannel", "st_twchannel", "ttW", "ttZ"],
    "V": ["dy_lep", "w_lnu"],
    "VV": ["WW", "ZZ", "WZ", "qqZZ"],
    "VVV": ["vvv"],
    "ggH": ["ggH"],
    "qqH": ["qqH"],
    "VH": ["ZH", "WH", "VH"],
    "ttH": ["ttH", "tHq", "tHW"],
    "bbH": ["bbH"],  # contains also pdf and alpha_s
    # "ggHH": signals_ggHH,  # included in inference model (THU_HH)
    "qqHH": signals_qqHH,
    "VHH": [],
    "ttHH": [],
}

# mapping, which processes are used for which pdf (rate) uncertainty
# key: suffix of the "pdf_{key}" systematic name; value: config process names
# NOTE(review): "Higgs_ggHH"/"Higgs_qqHH" map to the signal sets while other values
# are lists — only iteration/membership is needed downstream, but confirm this is intended
processes_per_pdf_rate = {
    "gg": ["tt", "ttZ", "ggZZ"],
    "qqbar": ["st_schannel", "st_tchannel", "dy_lep", "w_lnu", "vvv", "qqZZ", "ttW"],
    "qg": ["st_twchannel"],
    "Higgs_gg": ["ggH"],
    "Higgs_qqbar": ["qqH", "ZH", "WH", "VH"],
    # "Higgs_qg": [],  # none so far
    "Higgs_ttH": ["ttH", "tHq", "tHW"],
    # "Higgs_bbh": ["bbH"],  # removed
    "Higgs_ggHH": signals_ggHH,
    "Higgs_qqHH": signals_qqHH,
    "Higgs_VHH": ["HHZ", "HHW+", "HHW-"],
    "Higgs_ttHH": ["ttHH"],
}

# mapping for each shape uncertainty, which process is used.
# If "all" is included, takes all processes except for the ones specified (starting with !)
processes_per_shape = {}

# shape uncertainties that apply to every process
for _shape_name in (
    "btag_hf",
    "btag_lf",
    "btag_hfstats1_2017",
    "btag_hfstats2_2017",
    "btag_lfstats1_2017",
    "btag_lfstats2_2017",
    "btag_cferr1",
    "btag_cferr2",
    "mu_trig",
    "e_sf",
    "e_trig",
    "minbias_xs",
    "top_pt",
):
    processes_per_shape[_shape_name] = ["all"]

# per-process pdf shape uncertainties
processes_per_shape.update({
    "pdf_shape_ggHH_kl_0_kt_1_sl_hbbhww": ["ggHH_kl_0_kt_1_sl_hbbhww"],
    "pdf_shape_ggHH_kl_1_kt_1_sl_hbbhww": ["ggHH_kl_1_kt_1_sl_hbbhww"],
    "pdf_shape_ggHH_kl_2p45_kt_1_sl_hbbhww": ["ggHH_kl_2p45_kt_1_sl_hbbhww"],
    "pdf_shape_ggHH_kl_5_kt_1_sl_hbbhww": ["ggHH_kl_5_kt_1_sl_hbbhww"],
    "pdf_shape_tt": ["tt"],
    "pdf_shape_st": ["st_schannel", "st_twchannel"],  # TODO: there was some bug with "st_tchannel"
    "pdf_shape_dy": ["dy_lep"],
    "pdf_shape_w": ["w_lnu"],
})

# per-process mur/muf envelope shape uncertainties
processes_per_shape.update({
    "murf_envelope_ggHH_kl_0_kt_1_sl_hbbhww": ["ggHH_kl_0_kt_1_sl_hbbhww"],
    "murf_envelope_ggHH_kl_1_kt_1_sl_hbbhww": ["ggHH_kl_1_kt_1_sl_hbbhww"],
    "murf_envelope_ggHH_kl_2p45_kt_1_sl_hbbhww": ["ggHH_kl_2p45_kt_1_sl_hbbhww"],
    "murf_envelope_ggHH_kl_5_kt_1_sl_hbbhww": ["ggHH_kl_5_kt_1_sl_hbbhww"],
    "murf_envelope_tt": ["tt"],
    "murf_envelope_st": ["st_schannel", "st_tchannel", "st_twchannel"],
    "murf_envelope_dy": ["dy_lep"],
    "murf_envelope_w": ["w_lnu"],
})

# mapping for each shape uncertainty, which shift source is used
# per default: shape and source have the same name (except for pdf and murf, which are implemented per process)
source_per_shape = {
    shape: (
        "pdf" if "pdf_shape" in shape else
        "murf_envelope" if "murf_envelope" in shape else
        shape
    )
    for shape in processes_per_shape
}
Loading

0 comments on commit ad27dbe

Please sign in to comment.