Skip to content

Commit 08620cc

Browse files
Merge pull request #322 from johannbrehmer/develop
Fixed unzipping, user-defined binning in FisherInformation
2 parents 8479783 + 005a54b commit 08620cc

File tree

8 files changed

+251
-89
lines changed

8 files changed

+251
-89
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ install:
1515
- travis_wait pip install -q --no-cache-dir -e .
1616
script:
1717
- pytest tests/test_imports.py
18-
- pytest -s tests/test_toy_workflow.py
18+
- pytest -s tests/test_ratio_estimation.py
1919
- pytest -s tests/test_nuisance.py
2020
jobs:
2121
include:

examples/tutorial_particle_physics/4b_fisher_information.ipynb

Lines changed: 157 additions & 30 deletions
Large diffs are not rendered by default.

madminer/analysis.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ def __init__(self, filename, disable_morphing=False, include_nuisance_parameters
7777
logger.debug(" %s (%s)", key, values)
7878
else:
7979
logger.info("Did not find nuisance parameters")
80+
self.include_nuisance_parameters = False
8081

8182
logger.info("Found %s benchmarks, of which %s physical", self.n_benchmarks, self.n_benchmarks_phys)
8283
for (key, values), is_nuisance in zip(six.iteritems(self.benchmarks), self.benchmark_is_nuisance):
@@ -109,6 +110,9 @@ def __init__(self, filename, disable_morphing=False, include_nuisance_parameters
109110
self.nuisance_parameters, list(self.benchmarks.keys()), self.reference_benchmark
110111
)
111112
logger.info("Found nuisance morphing setup")
113+
else:
114+
logger.info("Did not find nuisance morphing setup")
115+
self.include_nuisance_parameters = False
112116

113117
def event_loader(self, start=0, end=None, batch_size=100000, include_nuisance_parameters=None):
114118
if include_nuisance_parameters is None:

madminer/fisherinformation.py

Lines changed: 85 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,7 @@ def calculate_fisher_information_hist1d(
419419
theta,
420420
luminosity,
421421
observable,
422-
nbins,
422+
bins,
423423
histrange=None,
424424
cuts=None,
425425
efficiency_functions=None,
@@ -441,12 +441,14 @@ def calculate_fisher_information_hist1d(
441441
Expression for the observable to be histogrammed. The str will be parsed by Python's `eval()` function
442442
and can use the names of the observables in the MadMiner files.
443443
444-
nbins : int
445-
Number of bins in the histogram, excluding overflow bins.
444+
bins : int or ndarray
445+
If int: number of bins in the histogram, excluding overflow bins. Otherwise, defines the bin boundaries
446+
(excluding overflow bins).
446447
447448
histrange : tuple of float or None, optional
448449
Minimum and maximum value of the histogram in the form `(min, max)`. Overflow bins are always added. If
449-
None, variable-width bins with equal cross section are constructed automatically. Default value: None.
450+
None and bins is an int, variable-width bins with equal cross section are constructed automatically.
451+
Default value: None.
450452
451453
cuts : None or list of str, optional
452454
Cuts. Each entry is a parseable Python expression that returns a bool (True if the event should pass a cut,
@@ -478,19 +480,11 @@ def calculate_fisher_information_hist1d(
478480
cuts = []
479481
if efficiency_functions is None:
480482
efficiency_functions = []
481-
include_nuisance_parameters = include_nuisance_parameters and (self.nuisance_parameters is not None)
482483

483484
# Binning
484-
dynamic_binning = histrange is None
485-
if dynamic_binning:
486-
n_bins_total = nbins
487-
bin_boundaries = self._calculate_dynamic_binning(
488-
observable, theta, nbins, n_events_dynamic_binning, cuts, efficiency_functions
489-
)
490-
logger.debug("Automatic dynamic binning: bin boundaries %s", bin_boundaries)
491-
else:
492-
n_bins_total = nbins + 2
493-
bin_boundaries = np.linspace(histrange[0], histrange[1], num=nbins + 1)
485+
bin_boundaries, n_bins_total = self._calculate_binning(
486+
bins, cuts, efficiency_functions, histrange, n_events_dynamic_binning, observable, theta
487+
)
494488

495489
# Loop over batches
496490
weights_benchmarks = np.zeros((n_bins_total, self.n_benchmarks))
@@ -512,17 +506,20 @@ def calculate_fisher_information_hist1d(
512506
histo_observables = np.asarray([self._eval_observable(obs_event, observable) for obs_event in observations])
513507

514508
# Find bins
515-
bins = np.searchsorted(bin_boundaries, histo_observables)
516-
assert ((0 <= bins) & (bins < n_bins_total)).all(), "Wrong bin {}".format(bins)
509+
i_bins = np.searchsorted(bin_boundaries, histo_observables)
510+
assert ((0 <= i_bins) & (i_bins < n_bins_total)).all(), "Wrong bin {}".format(i_bins)
517511

518512
# Add up
519513
for i in range(n_bins_total):
520-
if len(weights[bins == i]) > 0:
521-
weights_benchmarks[i] += np.sum(weights[bins == i], axis=0)
522-
weights_squared_benchmarks[i] += np.sum(weights[bins == i] ** 2, axis=0)
514+
if len(weights[i_bins == i]) > 0:
515+
weights_benchmarks[i] += np.sum(weights[i_bins == i], axis=0)
516+
weights_squared_benchmarks[i] += np.sum(weights[i_bins == i] ** 2, axis=0)
523517

524518
weights_benchmark_uncertainties = weights_squared_benchmarks ** 0.5
525519

520+
# Check cross sections per bin
521+
self._check_binning_stats(weights_benchmarks, weights_benchmark_uncertainties, theta)
522+
526523
# Calculate Fisher information in histogram
527524
fisher_info, covariance = self._calculate_fisher_information(
528525
theta,
@@ -534,14 +531,55 @@ def calculate_fisher_information_hist1d(
534531
)
535532
return fisher_info, covariance
536533

534+
def _check_binning_stats(
535+
self, weights_benchmarks, weights_benchmark_uncertainties, theta, report=5, n_bins_first_axis=None
536+
):
537+
theta_matrix = self._get_theta_benchmark_matrix(theta, zero_pad=False) # (n_benchmarks_phys,)
538+
sigma = mdot(theta_matrix, weights_benchmarks) # Shape (n_bins,)
539+
sigma_uncertainties = mdot(theta_matrix, weights_benchmark_uncertainties) # Shape (n_bins,)
540+
rel_uncertainties = sigma_uncertainties / np.maximum(sigma, 1.0e-12)
541+
542+
order = np.argsort(rel_uncertainties)[::-1]
543+
544+
logger.info("Bins with largest statistical uncertainties on rates:")
545+
for i_bin in order[:report]:
546+
bin_nd = i_bin + 1
547+
if n_bins_first_axis is not None:
548+
bin_nd = (i_bin // n_bins_first_axis + 1, i_bin % n_bins_first_axis + 1)
549+
logger.info(
550+
" Bin %s: (%.5f +/- %.5f) fb (%.0f %%)",
551+
bin_nd,
552+
1000.0 * sigma[i_bin],
553+
1000.0 * sigma_uncertainties[i_bin],
554+
100.0 * rel_uncertainties[i_bin],
555+
)
556+
557+
def _calculate_binning(
558+
self, bins, cuts, efficiency_functions, histrange, n_events_dynamic_binning, observable, theta
559+
):
560+
dynamic_binning = histrange is None and isinstance(bins, int)
561+
if dynamic_binning:
562+
n_bins_total = bins
563+
bin_boundaries = self._calculate_dynamic_binning(
564+
observable, theta, bins, n_events_dynamic_binning, cuts, efficiency_functions
565+
)
566+
logger.debug("Automatic dynamic binning: bin boundaries %s", bin_boundaries)
567+
elif isinstance(bins, int):
568+
n_bins_total = bins + 2
569+
bin_boundaries = np.linspace(histrange[0], histrange[1], num=bins + 1)
570+
else:
571+
bin_boundaries = bins
572+
n_bins_total = len(bins) + 1
573+
return bin_boundaries, n_bins_total
574+
537575
def calculate_fisher_information_hist2d(
538576
self,
539577
theta,
540578
luminosity,
541579
observable1,
542-
nbins1,
580+
bins1,
543581
observable2,
544-
nbins2,
582+
bins2,
545583
histrange1=None,
546584
histrange2=None,
547585
cuts=None,
@@ -565,15 +603,17 @@ def calculate_fisher_information_hist2d(
565603
Expression for the first observable to be histogrammed. The str will be parsed by Python's `eval()` function
566604
and can use the names of the observables in the MadMiner files.
567605
568-
nbins1 : int
569-
Number of bins along the first axis in the histogram, excluding overflow bins.
606+
bins1 : int or ndarray
607+
If int: number of bins along the first axis in the histogram in the histogram, excluding overflow bins.
608+
Otherwise, defines the bin boundaries along the first axis in the histogram (excluding overflow bins).
570609
571610
observable2 : str
572611
Expression for the first observable to be histogrammed. The str will be parsed by Python's `eval()` function
573612
and can use the names of the observables in the MadMiner files.
574613
575-
nbins2 : int
576-
Number of bins along the first axis in the histogram, excluding overflow bins.
614+
bins2 : int or ndarray
615+
If int: number of bins along the second axis in the histogram in the histogram, excluding overflow bins.
616+
Otherwise, defines the bin boundaries along the second axis in the histogram (excluding overflow bins).
577617
578618
histrange1 : tuple of float or None, optional
579619
Minimum and maximum value of the first axis of the histogram in the form `(min, max)`. Overflow bins are
@@ -617,27 +657,12 @@ def calculate_fisher_information_hist2d(
617657
efficiency_functions = []
618658

619659
# Binning
620-
dynamic_binning1 = histrange1 is None
621-
if dynamic_binning1:
622-
n_bins1_total = nbins1
623-
bin1_boundaries = self._calculate_dynamic_binning(
624-
observable1, theta, nbins1, n_events_dynamic_binning, cuts, efficiency_functions
625-
)
626-
logger.debug("Automatic dynamic binning for observable 1: bin boundaries %s", bin1_boundaries)
627-
else:
628-
n_bins1_total = nbins1 + 2
629-
bin1_boundaries = np.linspace(histrange1[0], histrange1[1], num=nbins1 + 1)
630-
631-
dynamic_binning2 = histrange2 is None
632-
if dynamic_binning2:
633-
n_bins2_total = nbins2
634-
bin2_boundaries = self._calculate_dynamic_binning(
635-
observable2, theta, nbins2, n_events_dynamic_binning, cuts, efficiency_functions
636-
)
637-
logger.debug("Automatic dynamic binning for observable 2: bin boundaries %s", bin2_boundaries)
638-
else:
639-
n_bins2_total = nbins1 + 2
640-
bin2_boundaries = np.linspace(histrange2[0], histrange2[1], num=nbins2 + 1)
660+
bin1_boundaries, n_bins1_total = self._calculate_binning(
661+
bins1, cuts, efficiency_functions, histrange1, n_events_dynamic_binning, observable1, theta
662+
)
663+
bin2_boundaries, n_bins2_total = self._calculate_binning(
664+
bins2, cuts, efficiency_functions, histrange2, n_events_dynamic_binning, observable2, theta
665+
)
641666

642667
# Loop over batches
643668
weights_benchmarks = np.zeros((n_bins1_total, n_bins2_total, self.n_benchmarks))
@@ -664,25 +689,31 @@ def calculate_fisher_information_hist2d(
664689
)
665690

666691
# Find bins
667-
bins1 = np.searchsorted(bin1_boundaries, histo1_observables)
668-
bins2 = np.searchsorted(bin2_boundaries, histo2_observables)
692+
i_bins1 = np.searchsorted(bin1_boundaries, histo1_observables)
693+
i_bins2 = np.searchsorted(bin2_boundaries, histo2_observables)
669694

670-
assert ((0 <= bins1) & (bins1 < n_bins1_total)).all(), "Wrong bin {}".format(bins1)
671-
assert ((0 <= bins1) & (bins1 < n_bins1_total)).all(), "Wrong bin {}".format(bins1)
695+
assert ((0 <= i_bins1) & (i_bins1 < n_bins1_total)).all(), "Wrong bin {}".format(i_bins1)
696+
assert ((0 <= i_bins2) & (i_bins2 < n_bins1_total)).all(), "Wrong bin {}".format(i_bins2)
672697

673698
# Add up
674699
for i in range(n_bins1_total):
675700
for j in range(n_bins2_total):
676-
if len(weights[(bins1 == i) & (bins2 == j)]) > 0:
677-
weights_benchmarks[i, j] += np.sum(weights[(bins1 == i) & (bins2 == j)], axis=0)
678-
weights_squared_benchmarks[i, j] += np.sum(weights[(bins1 == i) & (bins2 == j)] ** 2, axis=0)
701+
if len(weights[(i_bins1 == i) & (i_bins2 == j)]) > 0:
702+
weights_benchmarks[i, j] += np.sum(weights[(i_bins1 == i) & (i_bins2 == j)], axis=0)
703+
weights_squared_benchmarks[i, j] += np.sum(
704+
weights[(i_bins1 == i) & (i_bins2 == j)] ** 2, axis=0
705+
)
679706

680707
weights_benchmark_uncertainties = weights_squared_benchmarks ** 0.5
681708

682709
# Calculate Fisher information in histogram
683710
weights_benchmarks = weights_benchmarks.reshape(-1, self.n_benchmarks)
684711
weights_benchmark_uncertainties = weights_benchmark_uncertainties.reshape(-1, self.n_benchmarks)
685712

713+
self._check_binning_stats(
714+
weights_benchmarks, weights_benchmark_uncertainties, theta, n_bins_first_axis=n_bins1_total
715+
)
716+
686717
fisher_info, covariance = self._calculate_fisher_information(
687718
theta,
688719
weights_benchmarks,

madminer/utils/interfaces/delphes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def run_delphes(
2626
if extension == ".gz":
2727
logger.debug("Unzipping %s", hepmc_sample_filename)
2828
if not os.path.exists(filename):
29-
call_command("gunzip -k {}".format(hepmc_sample_filename))
29+
call_command("gunzip -c {} > {}".format(hepmc_sample_filename, filename))
3030
if delete_unzipped_file:
3131
to_delete = filename
3232
hepmc_sample_filename = filename

madminer/utils/interfaces/hepmc.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ def extract_weight_order(filename, default_weight_label=None):
1313
new_filename, extension = os.path.splitext(filename)
1414
if extension == ".gz":
1515
if not os.path.exists(new_filename):
16-
call_command("gunzip -k {}".format(filename))
16+
call_command("gunzip -c {} > {}".format(filename, new_filename))
1717
filename = new_filename
1818

1919
with open(filename, encoding="latin-1") as file:

madminer/utils/interfaces/lhe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -853,7 +853,7 @@ def _untar_and_parse_lhe_file(filename):
853853
new_filename, extension = os.path.splitext(filename)
854854
if extension == ".gz":
855855
if not os.path.exists(new_filename):
856-
call_command("gunzip -k {}".format(filename))
856+
call_command("gunzip -c {} > {}".format(filename, new_filename))
857857
filename = new_filename
858858

859859
# In some cases, the LHE comments can contain bad characters

tests/test_toy_workflow.py renamed to tests/test_ratio_estimation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ def run_test():
119119
return rmse
120120

121121

122-
def test_toy_workflow():
122+
def test_ratio_estimation():
123123
rmse = run_test()
124124

125125
print("Root mean squared error of true log r vs ALICES log r: {}".format(rmse))

0 commit comments

Comments
 (0)