Skip to content

Commit

Permalink
adding inference model grav_model
Browse files Browse the repository at this point in the history
  • Loading branch information
aalvesan committed Apr 23, 2024
1 parent fd3d408 commit ad27dbe
Show file tree
Hide file tree
Showing 6 changed files with 701 additions and 1 deletion.
238 changes: 238 additions & 0 deletions hbt/inference/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
# coding: utf-8

"""
Collection of helper functions for creating inference models
"""

import law
import order as od

from columnflow.inference import InferenceModel, ParameterType, ParameterTransformation
from columnflow.util import maybe_import, DotDict
from columnflow.config_util import get_datasets_from_process

import hbt.inference.constants as const # noqa


np = maybe_import("numpy")
ak = maybe_import("awkward")

logger = law.logger.get_logger(__name__)


class HBTInferenceModelBase(InferenceModel):
    """
    Base class for all inference models in the hbt analysis.

    On instantiation the datacard model is built in three steps — categories,
    processes and parameters — followed by a cleanup. Concrete models are meant
    to be created via ``cls.derive`` and must additionally provide
    ``config_categories`` and ``config_variable``, which are referenced below
    but not declared on this base class.
    """

    #
    # Model attributes, can be changed via `cls.derive`
    #

    # default ml model, used for resolving defaults
    ml_model_name: str = "dense_default"

    # list of all processes/channels/systematics to include in the datacards
    # NOTE(review): these mutable class-level lists are shared by all subclasses
    # that do not override them via `derive` — confirm this is intended
    processes: list = []
    channels: list = []
    systematics: list = []

    # customization of channels
    mc_stats: bool = True
    skip_data: bool = True

    def __init__(self, config_inst: od.Config, *args, **kwargs):
        """
        Build the full inference model from *config_inst*.

        :param config_inst: analysis config providing categories, processes,
            datasets and uncertainty values.
        """
        # NOTE(review): *args / **kwargs are accepted but not forwarded to
        # super().__init__ — confirm this is intentional
        super().__init__(config_inst)

        # build the model: categories first, then processes, then parameters
        self.add_inference_categories()
        self.add_inference_processes()
        self.add_inference_parameters()
        # self.print_model()

        #
        # post-processing
        #

        self.cleanup()

    def print_model(self):
        """ Helper to print categories, processes and parameters of the InferenceModel """
        for cat in self.categories:
            print(f"{'=' * 20} {cat.name}")
            print(f"Variable {cat.config_variable} \nCategory {cat.config_category}")
            print(f"Processes {[p.name for p in cat.processes]}")
            print(f"Parameters {set().union(*[[param.name for param in proc.parameters] for proc in cat.processes])}")

    def cat_name(self: InferenceModel, config_cat_inst: od.Category):
        """
        Function to determine the inference category name from a config category.

        :param config_cat_inst: config category instance whose name is prefixed.
        :return: the inference category name, ``"cat_<config category name>"``.
        """
        # root_cats = config_cat_inst.x.root_cats
        final_name = f"cat_{config_cat_inst.name}"
        return final_name

    # def config_variable(self: InferenceModel, config_cat_inst: od.Config):
    #     """ Function to determine inference variable name from config category """
    #     root_cats = config_cat_inst.x.root_cats
    #     if dnn_cat := root_cats.get("dnn"):
    #         dnn_proc = dnn_cat.replace("ml_", "")
    #         return f"mlscore.{dnn_proc}"
    #     else:
    #         return "mli_mbb"

    def customize_category(self: InferenceModel, cat_inst: DotDict, config_cat_inst: od.Config):
        """
        Hook to customize an inference category after it has been added.

        :param cat_inst: the inference category (a DotDict) to customize in place.
        :param config_cat_inst: the corresponding config category (unused here,
            kept for overriding subclasses).
        """
        # root_cats = config_cat_inst.x.root_cats
        # variables = ["jet1_pt"]
        # if dnn_cat := root_cats.get("dnn"):
        #     dnn_proc = dnn_cat.replace("ml_", "")
        #     variables.append(f"mlscore.{dnn_proc}")
        cat_inst.variables_to_plot = self.config_variable

    def add_inference_categories(self: InferenceModel):
        """
        Create one inference category per entry in ``self.config_categories``.

        When ``self.skip_data`` is set, data is estimated from the simulated
        processes instead of using real data datasets.
        """
        # get the MLModel inst
        # ml_model_inst = MLModel.get_cls(self.ml_model_name)(self.config_inst)
        for config_category in self.config_categories:
            cat_inst = self.config_inst.get_category(config_category)
            # root_cats = cat_inst.x.root_cats
            # lep_channel = root_cats.get("lep")
            # if lep_channel not in self.config_inst.x.lepton_channels:
            #     raise Exception(
            #         "Each inference category needs to be categorized based on number of leptons; "
            #         f"Options: {self.config_inst.x.lepton_channels}",
            #     )

            cat_name = self.cat_name(cat_inst)
            cat_kwargs = dict(
                config_category=config_category,
                config_variable=self.config_variable,
                mc_stats=self.mc_stats,
            )
            if self.skip_data:
                # build "data" from the sum of MC processes instead of real data
                cat_kwargs["data_from_processes"] = self.processes
            # else:
            #     cat_kwargs["config_data_datasets"] = const.data_datasets[lep_channel]

            self.add_category(cat_name, **cat_kwargs)

            # do some customization of the inference category
            self.customize_category(self.get_category(cat_name), cat_inst)

    def add_inference_processes(self: InferenceModel):
        """
        Function that adds processes with their corresponding datasets to all categories
        of the inference model.

        :raises Exception: when a process is missing from the config or when the
            same dataset would be used by more than one process.
        """
        used_datasets = set()
        for proc in self.processes:
            if not self.config_inst.has_process(proc):
                raise Exception(f"Process {proc} not included in the config {self.config_inst.name}")

            # get datasets corresponding to this process
            datasets = [
                d.name for d in
                get_datasets_from_process(self.config_inst, proc, strategy="inclusive")
            ]
            # hack for dy: restrict to the pt-binned datasets only
            if proc == "dy":
                datasets = [x for x in datasets if x.startswith("dy_lep_pt")]

            # check that no dataset is used multiple times
            if datasets_already_used := used_datasets.intersection(datasets):
                raise Exception(f"{datasets_already_used} datasets are used for multiple processes")
            used_datasets |= set(datasets)

            self.add_process(
                const.inference_procnames.get(proc, proc),
                config_process=proc,
                # signal processes are recognized by "HH_"/"hh_" in their name
                # (e.g. "graviton_hh_ggf_bbtautau_m400")
                is_signal=(any(x in proc for x in ["HH_", "hh_"])),
                config_mc_datasets=datasets,
            )

    def add_inference_parameters(self: InferenceModel):
        """
        Function that adds all parameters (systematic variations) to the inference model
        """
        # define the two relevant types of parameter groups
        self.add_parameter_group("experiment")
        self.add_parameter_group("theory")

        # add rate + shape parameters to the inference model
        self.add_rate_parameters()
        # self.add_shape_parameters()

        # TODO: check that all requested systematics are in final MLModel?

    def add_rate_parameters(self: InferenceModel):
        """
        Add all rate parameters (luminosity, QCD scale, pdf) to the inference model.

        Only systematics listed in ``self.systematics`` are added; processes not
        present in ``self.processes`` or lacking the relevant cross-section
        uncertainty are skipped silently.
        """
        ecm = self.config_inst.campaign.ecm

        # lumi
        lumi = self.config_inst.x.luminosity
        for unc_name in lumi.uncertainties:
            if unc_name not in self.systematics:
                continue

            self.add_parameter(
                unc_name,
                type=ParameterType.rate_gauss,
                effect=lumi.get(names=unc_name, direction=("down", "up"), factor=True),
                transformations=[ParameterTransformation.symmetrize],
            )
            self.add_parameter_to_group(unc_name, "experiment")

        # add QCD scale (rate) uncertainties to inference model
        # TODO: combine scale and mtop uncertainties for specific processes?
        # TODO: some scale/pdf uncertainties should be rounded to 3 digits, others to 4 digits
        # NOTE: it might be easier to just take the recommended uncertainty values from HH conventions at
        #       https://gitlab.cern.ch/hh/naming-conventions instead of taking the values from CMSDB
        for k, procs in const.processes_per_QCDScale.items():
            syst_name = f"QCDScale_{k}"
            if syst_name not in self.systematics:
                continue

            for proc in procs:
                if proc not in self.processes:
                    continue
                process_inst = self.config_inst.get_process(proc)
                if "scale" not in process_inst.xsecs[ecm]:
                    continue
                self.add_parameter(
                    syst_name,
                    process=const.inference_procnames.get(proc, proc),
                    type=ParameterType.rate_gauss,
                    effect=tuple(map(
                        lambda f: round(f, 3),
                        process_inst.xsecs[ecm].get(names="scale", direction=("down", "up"), factor=True),
                    )),
                )
            self.add_parameter_to_group(syst_name, "theory")

        # add PDF rate uncertainties to inference model
        for k, procs in const.processes_per_pdf_rate.items():
            syst_name = f"pdf_{k}"
            if syst_name not in self.systematics:
                continue

            for proc in procs:
                if proc not in self.processes:
                    continue
                process_inst = self.config_inst.get_process(proc)
                if "pdf" not in process_inst.xsecs[ecm]:
                    continue

                self.add_parameter(
                    # fix: use syst_name for consistency with the QCDScale loop
                    # (it is the same string as f"pdf_{k}" and matches the group
                    # assignment below)
                    syst_name,
                    process=const.inference_procnames.get(proc, proc),
                    type=ParameterType.rate_gauss,
                    effect=tuple(map(
                        lambda f: round(f, 3),
                        process_inst.xsecs[ecm].get(names="pdf", direction=("down", "up"), factor=True),
                    )),
                )
            self.add_parameter_to_group(syst_name, "theory")

113 changes: 113 additions & 0 deletions hbt/inference/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# coding: utf-8

"""
Collection of configurations that stay constant for the analysis
"""

# collection of all signal processes, split by production mode
signals_ggHH = {
    # "ggHH_kl_0_kt_1_sl_hbbhww", "ggHH_kl_1_kt_1_sl_hbbhww",
    # "ggHH_kl_2p45_kt_1_sl_hbbhww", "ggHH_kl_5_kt_1_sl_hbbhww",
    "graviton_hh_ggf_bbtautau_m400",
}
signals_qqHH = {
    # "qqHH_CV_1_C2V_1_kl_1_sl_hbbhww", "qqHH_CV_1_C2V_1_kl_0_sl_hbbhww", "qqHH_CV_1_C2V_1_kl_2_sl_hbbhww",
    # "qqHH_CV_1_C2V_0_kl_1_sl_hbbhww", "qqHH_CV_1_C2V_2_kl_1_sl_hbbhww",
    # "qqHH_CV_0p5_C2V_1_kl_1_sl_hbbhww", "qqHH_CV_1p5_C2V_1_kl_1_sl_hbbhww",
    "graviton_hh_vbf_bbtautau_m400",
}
# union of all signal processes
signals = signals_ggHH | signals_qqHH

# mapping between lepton categories and datasets (only 2017 ATM)
_data_eras = ("b", "c", "d", "e", "f")
data_datasets = {
    "1e": {f"data_e_{era}" for era in _data_eras},
    "1mu": {f"data_mu_{era}" for era in _data_eras},
    "2e": {"data_e_b"},  # TODO: 2 lep datasets in cmsdb + config
    "2mu": {"data_mu_b"},  # TODO
    "emu": {"data_mu_b"},  # TODO
}
# flat set of all data datasets across lepton categories
merged_datasets = {name for names in data_datasets.values() for name in names}

# mapping between process names in the config and inference model
# (processes absent from this mapping keep their config name)
inference_procnames = {
    # key: config process name, value: inference model process name
    "foo": "bar",  # NOTE(review): looks like a placeholder entry — confirm it is intended to stay
    # "st": "ST",
    # "tt": "TT",
}

# mapping, which processes are used for which QCDScale (rate) uncertainty
# key: suffix of the "QCDScale_{key}" systematic name; value: config process names
# NOTE(review): "qqHH" maps to the signals_qqHH set while other values are lists —
# only iteration/membership is needed downstream, but confirm the mixed types are intended
processes_per_QCDScale = {
    "ttbar": ["tt", "st_tchannel", "st_schannel", "st_twchannel", "ttW", "ttZ"],
    "V": ["dy_lep", "w_lnu"],
    "VV": ["WW", "ZZ", "WZ", "qqZZ"],
    "VVV": ["vvv"],
    "ggH": ["ggH"],
    "qqH": ["qqH"],
    "VH": ["ZH", "WH", "VH"],
    "ttH": ["ttH", "tHq", "tHW"],
    "bbH": ["bbH"],  # contains also pdf and alpha_s
    # "ggHH": signals_ggHH,  # included in inference model (THU_HH)
    "qqHH": signals_qqHH,
    "VHH": [],
    "ttHH": [],
}

# mapping, which processes are used for which pdf (rate) uncertainty
# key: suffix of the "pdf_{key}" systematic name; value: config process names
# NOTE(review): "Higgs_ggHH"/"Higgs_qqHH" map to the signal sets while other values
# are lists — only iteration/membership is needed downstream, but confirm this is intended
processes_per_pdf_rate = {
    "gg": ["tt", "ttZ", "ggZZ"],
    "qqbar": ["st_schannel", "st_tchannel", "dy_lep", "w_lnu", "vvv", "qqZZ", "ttW"],
    "qg": ["st_twchannel"],
    "Higgs_gg": ["ggH"],
    "Higgs_qqbar": ["qqH", "ZH", "WH", "VH"],
    # "Higgs_qg": [],  # none so far
    "Higgs_ttH": ["ttH", "tHq", "tHW"],
    # "Higgs_bbh": ["bbH"],  # removed
    "Higgs_ggHH": signals_ggHH,
    "Higgs_qqHH": signals_qqHH,
    "Higgs_VHH": ["HHZ", "HHW+", "HHW-"],
    "Higgs_ttHH": ["ttHH"],
}

# mapping for each shape uncertainty, which process is used.
# If "all" is included, takes all processes except for the ones specified (starting with !)
processes_per_shape = {}

# shape uncertainties that apply to every process
for _shape_name in (
    "btag_hf",
    "btag_lf",
    "btag_hfstats1_2017",
    "btag_hfstats2_2017",
    "btag_lfstats1_2017",
    "btag_lfstats2_2017",
    "btag_cferr1",
    "btag_cferr2",
    "mu_trig",
    "e_sf",
    "e_trig",
    "minbias_xs",
    "top_pt",
):
    processes_per_shape[_shape_name] = ["all"]

# per-process pdf shape uncertainties
processes_per_shape.update({
    "pdf_shape_ggHH_kl_0_kt_1_sl_hbbhww": ["ggHH_kl_0_kt_1_sl_hbbhww"],
    "pdf_shape_ggHH_kl_1_kt_1_sl_hbbhww": ["ggHH_kl_1_kt_1_sl_hbbhww"],
    "pdf_shape_ggHH_kl_2p45_kt_1_sl_hbbhww": ["ggHH_kl_2p45_kt_1_sl_hbbhww"],
    "pdf_shape_ggHH_kl_5_kt_1_sl_hbbhww": ["ggHH_kl_5_kt_1_sl_hbbhww"],
    "pdf_shape_tt": ["tt"],
    "pdf_shape_st": ["st_schannel", "st_twchannel"],  # TODO: there was some bug with "st_tchannel"
    "pdf_shape_dy": ["dy_lep"],
    "pdf_shape_w": ["w_lnu"],
})

# per-process mur/muf envelope shape uncertainties
processes_per_shape.update({
    "murf_envelope_ggHH_kl_0_kt_1_sl_hbbhww": ["ggHH_kl_0_kt_1_sl_hbbhww"],
    "murf_envelope_ggHH_kl_1_kt_1_sl_hbbhww": ["ggHH_kl_1_kt_1_sl_hbbhww"],
    "murf_envelope_ggHH_kl_2p45_kt_1_sl_hbbhww": ["ggHH_kl_2p45_kt_1_sl_hbbhww"],
    "murf_envelope_ggHH_kl_5_kt_1_sl_hbbhww": ["ggHH_kl_5_kt_1_sl_hbbhww"],
    "murf_envelope_tt": ["tt"],
    "murf_envelope_st": ["st_schannel", "st_tchannel", "st_twchannel"],
    "murf_envelope_dy": ["dy_lep"],
    "murf_envelope_w": ["w_lnu"],
})

# mapping for each shape uncertainty, which shift source is used
# per default: shape and source have the same name (except for pdf and murf, which are implemented per process)
source_per_shape = {
    shape: (
        "pdf" if "pdf_shape" in shape else
        "murf_envelope" if "murf_envelope" in shape else
        shape
    )
    for shape in processes_per_shape
}
Loading

0 comments on commit ad27dbe

Please sign in to comment.