@@ -40,7 +40,7 @@
# %% [markdown]
# ### Imports: setting up our environment

- # %%
+ # %% tags=[]
import asyncio
import logging
import os
@@ -89,7 +89,7 @@
#
# The input files are all in the 1–3 GB range.

- # %%
+ # %% tags=[]
### GLOBAL CONFIGURATION
# input files per process, set to e.g. 10 (smaller number = faster)
N_FILES_MAX_PER_SAMPLE = 5
@@ -114,7 +114,7 @@
# - calculating systematic uncertainties at the event and object level,
# - filling all the information into histograms that get aggregated and ultimately returned to us by `coffea`.

- # %%
+ # %% tags=[]
# functions creating systematic variations
def flat_variation(ones):
    # 2.5% weight variations
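The function body is cut off by the diff here. As a hedged sketch, a flat weight variation of this kind scales an array of ones up and down by a fixed amount (the ±2.5% factor comes from the comment above; the broadcast shape is an assumption about how the weights are consumed downstream):

```python
import numpy as np

def flat_variation(ones):
    # scale an array of ones by +/-2.5% to obtain up/down weight variations
    # (the column broadcast against shape (2,) is an assumption for illustration)
    return (1.0 + np.array([0.025, -0.025], dtype=np.float32)) * ones[:, None]
```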
@@ -316,7 +316,7 @@ def postprocess(self, accumulator):
#
# Here, we gather all the required information about the files we want to process: paths to the files and associated metadata.

- # %%
+ # %% tags=[]
fileset = utils.construct_fileset(N_FILES_MAX_PER_SAMPLE, use_xcache=False, af_name=config["benchmarking"]["AF_NAME"])  # local files on /data for ssl-dev

print(f"processes in fileset: {list(fileset.keys())}")
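For orientation, a hypothetical sketch of the layout `construct_fileset` returns; the field names are assumptions inferred from how `fileset` is used further down:

```python
# hypothetical example only; real paths and metadata come from utils.construct_fileset
example_fileset = {
    "ttbar__nominal": {
        "files": ["/data/ttbar__nominal/0.root", "/data/ttbar__nominal/1.root"],
        "metadata": {"process": "ttbar", "variation": "nominal"},
    }
}
print(f"processes in fileset: {list(example_fileset.keys())}")
```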
@@ -329,7 +329,7 @@ def postprocess(self, accumulator):
#
# Define the `func_adl` query used to extract columns and filter events.

- # %%
+ # %% tags=[]
def get_query(source: ObjectStream) -> ObjectStream:
    """Query for event / column selection: >=4j >=1b, ==1 lep with pT>25 GeV, return relevant columns
    """
@@ -355,7 +355,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
#
# Using the query created with `func_adl`, we use `ServiceX` to read the CMS Open Data files and build cached files containing only the specific event information dictated by the query.

- # %%
+ # %% tags=[]
if USE_SERVICEX:
    # dummy dataset on which to generate the query
    dummy_ds = ServiceXSourceUpROOT("cernopendata://dummy", "Events", backend_name="uproot")
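A hedged sketch of how such a query is typically rendered and delivered with the `servicex` frontend; the method names come from the `servicex` Python package, but the exact arguments here are assumptions, not the notebook's actual code:

```python
query = get_query(dummy_ds).value()  # render the func_adl query to a query string

from servicex import ServiceXDataset

ds = ServiceXDataset(fileset["ttbar__nominal"]["files"], backend_name="uproot")
files = ds.get_data_rootfiles_uri(query, title="ttbar__nominal")  # cached outputs
```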
@@ -385,7 +385,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
#
# When `USE_SERVICEX` is `False`, the input files need to be processed during this step as well.

- # %%
+ # %% tags=[]
NanoAODSchema.warn_missing_crossrefs = False  # silences warnings about branches we will not use here
if USE_DASK:
    executor = processor.DaskExecutor(client=utils.get_client(af=config["global"]["AF"]))
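A sketch of how the chosen executor is typically wired into a `coffea` 0.7-style `Runner`. The processor class name `TtbarAnalysis` and the chunk size are assumptions; the hunk headers above only hint at the processor defined earlier in the notebook:

```python
run = processor.Runner(
    executor=executor,
    schema=NanoAODSchema,  # interpret inputs as NanoAOD
    savemetrics=True,      # also return performance metrics, used below
    chunksize=250_000,     # assumed value
)
all_histograms, metrics = run(fileset, "Events", processor_instance=TtbarAnalysis())
```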
@@ -410,7 +410,7 @@ def get_query(source: ObjectStream) -> ObjectStream:

print(f"\nexecution took {exec_time:.2f} seconds")

- # %%
+ # %% tags=[]
# track metrics
dataset_source = "/data" if fileset["ttbar__nominal"]["files"][0].startswith("/data") else "https://xrootd-local.unl.edu:1094"  # TODO: xcache support
metrics.update({
@@ -448,15 +448,15 @@ def get_query(source: ObjectStream) -> ObjectStream:
# Let's have a look at the data we obtained.
# We built histograms in two phase space regions, for multiple physics processes and systematic variations.

- # %%
+ # %% tags=[]
utils.set_style()

all_histograms[120j::hist.rebin(2), "4j1b", :, "nominal"].stack("process")[::-1].plot(stack=True, histtype="fill", linewidth=1, edgecolor="grey")
plt.legend(frameon=False)
plt.title(">= 4 jets, 1 b-tag")
plt.xlabel("HT [GeV]");

- # %%
+ # %% tags=[]
all_histograms[:, "4j2b", :, "nominal"].stack("process")[::-1].plot(stack=True, histtype="fill", linewidth=1, edgecolor="grey")
plt.legend(frameon=False)
plt.title(">= 4 jets, >= 2 b-tags")
@@ -471,7 +471,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
#
# We are making use of [UHI](https://uhi.readthedocs.io/) here to re-bin.

- # %%
+ # %% tags=[]
# b-tagging variations
all_histograms[120j::hist.rebin(2), "4j1b", "ttbar", "nominal"].plot(label="nominal", linewidth=2)
all_histograms[120j::hist.rebin(2), "4j1b", "ttbar", "btag_var_0_up"].plot(label="NP 1", linewidth=2)
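To unpack the UHI slice used above, here is a standalone sketch (the axis definition and fill values are made up) of what `120j::hist.rebin(2)` does:

```python
import hist

h = hist.Hist.new.Reg(25, 50, 550, name="observable", label="HT [GeV]").Weight()
h.fill(observable=[120.0, 260.0, 400.0])

# 120j is complex-number shorthand for "the bin containing the value 120",
# and hist.rebin(2) merges neighboring bin pairs: start at 120 GeV, rebin 2->1
rebinned = h[120j::hist.rebin(2)]
print(rebinned.axes[0].edges)
```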
@@ -482,7 +482,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
plt.xlabel("HT [GeV]")
plt.title("b-tagging variations");

- # %%
+ # %% tags=[]
# jet energy scale variations
all_histograms[:, "4j2b", "ttbar", "nominal"].plot(label="nominal", linewidth=2)
all_histograms[:, "4j2b", "ttbar", "pt_scale_up"].plot(label="scale up", linewidth=2)
@@ -497,7 +497,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
# We'll save everything to disk for subsequent usage.
# This also builds pseudo-data by combining events from the various simulation setups we have processed.

- # %%
+ # %% tags=[]
utils.save_histograms(all_histograms, fileset, "histograms.root")
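`utils.save_histograms` is repo-specific; as a hedged illustration of the underlying idea, histograms can be written to a ROOT file with `uproot` (the file and key names below are purely illustrative):

```python
import uproot

with uproot.recreate("histograms_sketch.root") as f:
    # write one histogram per region/process/variation combination
    f["4j1b_ttbar"] = all_histograms[120j::hist.rebin(2), "4j1b", "ttbar", "nominal"]
```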

# %% [markdown]
@@ -510,7 +510,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
# A statistical model has been defined in `cabinetry_config.yml`, ready to be used with our output.
# We will use `cabinetry` to combine all histograms into a `pyhf` workspace and fit the resulting statistical model to the pseudodata we built.

- # %%
+ # %% tags=[]
config = cabinetry.configuration.load("cabinetry_config.yml")

# rebinning: lower edge 110 GeV, merge bins 2->1
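The next steps are cut off by the diff here; they typically follow the standard `cabinetry` flow from config to workspace. A hedged sketch, assuming default template handling rather than whatever rebinning hook the notebook actually uses:

```python
cabinetry.templates.collect(config)             # read histograms listed in the config
cabinetry.templates.postprocess(config)         # optional post-processing step
ws = cabinetry.workspace.build(config)          # build the pyhf workspace used below
cabinetry.workspace.save(ws, "workspace.json")  # written for the pyhf inspection below
```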
@@ -523,13 +523,13 @@ def get_query(source: ObjectStream) -> ObjectStream:
# %% [markdown]
# We can inspect the workspace with `pyhf`, or use `pyhf` to perform inference.

- # %%
+ # %% tags=[]
# !pyhf inspect workspace.json | head -n 20

# %% [markdown]
# Let's try out what we built: the next cell performs a maximum likelihood fit of our statistical model to the pseudodata.

- # %%
+ # %% tags=[]
model, data = cabinetry.model_utils.model_and_data(ws)
fit_results = cabinetry.fit.fit(model, data)

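For comparison, a hedged sketch of driving the same fit directly with `pyhf` instead of `cabinetry`, reading the `workspace.json` written earlier (variable names chosen to avoid clashing with the notebook's own `model` and `data`):

```python
import json

import pyhf

spec = json.load(open("workspace.json"))
pyhf_ws = pyhf.Workspace(spec)
pyhf_model = pyhf_ws.model()
pyhf_data = pyhf_ws.data(pyhf_model)
bestfit = pyhf.infer.mle.fit(pyhf_data, pyhf_model)  # maximum likelihood estimates
```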
@@ -540,31 +540,31 @@ def get_query(source: ObjectStream) -> ObjectStream:
# %% [markdown]
# For this pseudodata, what is the resulting ttbar cross-section divided by the Standard Model prediction?

- # %%
+ # %% tags=[]
poi_index = model.config.poi_index
print(f"\nfit result for ttbar_norm: {fit_results.bestfit[poi_index]:.3f} +/- {fit_results.uncertainty[poi_index]:.3f}")

# %% [markdown]
# Let's also visualize the model before and after the fit, in both the regions we are using.
# The binning here corresponds to the binning used for the fit.

- # %%
+ # %% tags=[]
model_prediction = cabinetry.model_utils.prediction(model)
figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=config)
figs[0]["figure"]

- # %%
+ # %% tags=[]
figs[1]["figure"]

# %% [markdown]
# We can see very good post-fit agreement.

- # %%
+ # %% tags=[]
model_prediction_postfit = cabinetry.model_utils.prediction(model, fit_results=fit_results)
figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=config)
figs[0]["figure"]

- # %%
+ # %% tags=[]
figs[1]["figure"]

# %% [markdown]