
Commit d791555

Add functionality to combine posteriors via evidence sampling in extract_samples (#5168)
* add functionality to combine posteriors by sampling points based on evidence values
* add extra changes
* clean up line breaks and docs
* add handles for overlapping priors and mutually exclusive priors
* bugfix evidence writing with one file; alphabetize remapped params
* remove overlapping priors option
* skip saving remapped_params when combining posteriors via sampling
* fix to last commit
* don't avoid remapped_params; try saving only the first remapped_params as long as output names are the same
* brute-force combine remapped_params between multiple files
* update docs
* only save first file's remap metadata if not equal between files
1 parent c600c9f commit d791555
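
The heart of the change is evidence-weighted resampling: stack the samples from every input file, weight each file's points by its normalized evidence, and draw an output posterior the size of the smallest input. Below is a minimal standalone sketch of that idea; the helper name combine_posteriors and all of its inputs are hypothetical, not part of the commit.

import numpy
from scipy.special import logsumexp

def combine_posteriors(posteriors, logz, seed=None):
    """Resample stacked posterior samples, weighting each input array's
    points by its normalized evidence (a sketch, not the commit's code)."""
    rng = numpy.random.default_rng(seed)
    logz = numpy.asarray(logz, dtype=float)
    # each file gets total weight exp(logz_i - logsumexp(logz)), spread
    # evenly over its samples, so the full weight vector sums to 1
    file_wt = numpy.exp(logz - logsumexp(logz))
    weights = numpy.concatenate(
        [numpy.full(len(p), w / len(p)) for p, w in zip(posteriors, file_wt)])
    stacked = numpy.concatenate(posteriors)
    # the output has as many samples as the smallest input
    out_size = min(len(p) for p in posteriors)
    idx = rng.choice(len(stacked), size=out_size, replace=True, p=weights)
    return stacked[idx]

# e.g. two hypothetical inputs whose evidences differ by a factor of two:
combined = combine_posteriors(
    [numpy.random.normal(0, 1, 1000), numpy.random.normal(5, 1, 800)],
    logz=[numpy.log(2.0), numpy.log(1.0)], seed=42)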

File tree

1 file changed: +109, -9 lines


bin/inference/pycbc_inference_extract_samples

Lines changed: 109 additions & 9 deletions
@@ -43,6 +43,8 @@ import pycbc
 from pycbc.inference.io import (ResultsArgumentParser, results_from_cli,
                                 PosteriorFile, loadfile)
 from pycbc.inference.io.base_hdf import format_attr
+from scipy.special import logsumexp
+import warnings


 def isthesame(current_val, val):
@@ -83,6 +85,35 @@ parser.add_argument("--skip-groups", default=None, nargs="+",
                          "to write all groups if only one file is provided, "
                          "and all groups from the first file except "
                          "sampler_info if multiple files are provided.")
+parser.add_argument("--combine-via-sampling", action='store_true',
+                    default=False,
+                    help="Specify whether to combine the posteriors by "
+                         "sampling. By default, extract_samples will dump all "
+                         "samples from multiple inputs into one output file. "
+                         "If this option is specified, the samples will "
+                         "be randomly sampled with weighting based on the "
+                         "evidence in each input. For example, if the "
+                         "evidence of input 1 is twice that of input 2, "
+                         "the resulting posterior file with this option "
+                         "specified will have twice as many points from input "
+                         "1 as from input 2. The output will have the same "
+                         "number of samples as the smallest input file. "
+                         "An error is raised if this option is specified and "
+                         "any of the input files does not have a log_evidence "
+                         "attribute (e.g., the file used a sampler like emcee "
+                         "that does not report evidence). This option "
+                         "assumes the priors of all files are either identical "
+                         "(the default) or mutually exclusive; partially "
+                         "overlapping priors cannot be properly combined.")
+parser.add_argument("--mutually-exclusive-priors", action='store_true',
+                    default=False,
+                    help="If specifying --combine-via-sampling, specify "
+                         "whether to treat priors as mutually exclusive. By "
+                         "default, the provided input files are assumed to "
+                         "have identical priors, and their evidences will be "
+                         "averaged. If this option is specified, the priors "
+                         "are assumed to be non-overlapping across files, and "
+                         "their evidences will be summed together.")

 opts = parser.parse_args()
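
The 2:1 example in the --combine-via-sampling help text can be checked directly: evidences are stored as logs, so each file's total sampling weight is exp(logz_i - logsumexp(logz_list)). A quick check with made-up numbers:

import numpy
from scipy.special import logsumexp

# hypothetical log evidences with Z1 = 2 * Z2
logz_list = [numpy.log(2.0), numpy.log(1.0)]
logz_net = logsumexp(logz_list)  # log(Z1 + Z2) = log(3)
file_weights = numpy.exp(numpy.array(logz_list) - logz_net)
print(file_weights)  # [0.6667 0.3333] -> a 2:1 sampling ratio, as documented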

@@ -101,10 +132,52 @@ if len(opts.input_file) == 1:
 # convert samples to a dict in which the keys are the labels
 # also stack results if multiple files were provided
 if len(opts.input_file) > 1:
-    samples = {labels[p]: numpy.concatenate([s[p] for s in samples])
-               for p in params}
+    if opts.combine_via_sampling:
+        raw_samples = {labels[p]: numpy.concatenate([s[p] for s in samples])
+                       for p in params}
+        logz_list = []
+        dlogz_list = []
+        len_list = []
+        raw_samps_list = []
+        weights_list = []
+        for file in opts.input_file:
+            fp = loadfile(file, 'r')
+            # get evidence from each file if possible
+            try:
+                logz, dlogz = fp.log_evidence
+            except KeyError:
+                raise ValueError(f"Cannot combine evidences; file {file} "
+                                 "does not have a log_evidence attr")
+            logz_list.append(logz)
+            dlogz_list.append(dlogz)
+            # get samples from each file
+            file_samps = fp.read_samples(list(fp['samples'].keys()))
+            raw_samps_list.append(file_samps)
+            # get the number of samples from each file
+            len_list.append(len(file_samps))
+        # compute sampling weights from evidences
+        logz_net = logsumexp(logz_list)
+        len_net = sum(len_list)
+        out_size = min(len_list)
+        for i in range(len(opts.input_file)):
+            # weight each file's samples according to logz
+            logwt = logz_list[i] - logz_net
+            weights_list.append([numpy.exp(logwt)/len_list[i] for j in
+                                 range(len_list[i])])
+        # randomly sample indices from all samples
+        weights = numpy.concatenate(weights_list)
+        idx = numpy.random.choice(int(len_net), size=out_size, replace=True,
+                                  p=weights)
+        samples = {param: raw_samples[param][idx] for param in
+                   raw_samples.keys()}
+    else:
+        samples = {labels[p]: numpy.concatenate([s[p] for s in samples])
+                   for p in params}
 else:
     samples = {labels[p]: samples[p] for p in params}
+    if opts.combine_via_sampling:
+        warnings.warn("Specified combine_via_sampling with only one input "
+                      "file. This option will have no effect.")

 # create the file
 outtype = PosteriorFile.name
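
One property worth noting in the block above: numpy.random.choice requires p to sum to 1 (within tolerance). The per-sample weights satisfy this because each file contributes a total of exp(logz_i - logz_net) spread evenly over its samples, and those totals sum to 1. A small check under assumed sample counts and evidences:

import numpy
from scipy.special import logsumexp

# hypothetical sample counts and log evidences for three input files
len_list = [1500, 900, 1200]
logz_list = [100.0, 99.3, 101.2]

logz_net = logsumexp(logz_list)
weights = numpy.concatenate(
    [numpy.full(n, numpy.exp(lz - logz_net) / n)
     for n, lz in zip(len_list, logz_list)])
assert numpy.isclose(weights.sum(), 1.0)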
@@ -137,10 +210,13 @@ for fp in fps:
 skip_attrs = ['filetype', 'thin_start', 'thin_interval', 'thin_end',
               'thinned_by', 'cmd', 'resume_points', 'effective_nsamples',
               'run_start_time', 'run_end_time']
-# also skip evidence if multiple files are being combined, since that will
-# not be the same
+# also skip evidence if multiple files are being combined; this will be
+# handled via sampling if specified
 if len(opts.input_file) > 1:
     skip_attrs += ['log_evidence', 'dlog_evidence']
+
+# make sure attrs are the same between files...
+cat_params = False
 for fp in fps:
     for key in map(format_attr, fp.attrs):
         if key not in skip_attrs:
@@ -151,12 +227,36 @@ for fp in fps:
                 out.attrs[key] = val
             current_val = format_attr(out.attrs[key])
             if not isthesame(current_val, val):
-                raise ValueError("cannot combine all files; file attr {} is "
-                                 "not the same across all files ({} vs {})"
-                                 .format(key, current_val, val))
+                if key == 'remapped_params':
+                    # ...unless it's remapped_params; just save the first
+                    # file's entries if they don't match between files
+                    warnings.warn("remapped_params metadata does not match "
+                                  "between files; saving metadata from the "
+                                  "first file")
+                else:
+                    raise ValueError("cannot combine all files; file attr {} is "
+                                     "not the same across all files ({} vs {})"
+                                     .format(key, current_val, val))

-# store what parameters were renamed
-out.attrs['remapped_params'] = list(labels.items())
+# store what parameters were renamed (if not already saved above)
+if not cat_params:
+    out.attrs['remapped_params'] = list(labels.items())
+
+# write the combined evidence and dlog evidence if combining via sampling
+if opts.combine_via_sampling:
+    if len(opts.input_file) == 1:
+        # there's only one file; just copy over what's in the input
+        out.attrs['log_evidence'] = fps[0].log_evidence[0]
+        out.attrs['dlog_evidence'] = fps[0].log_evidence[1]
+    elif opts.mutually_exclusive_priors:
+        # add together the evidences; quadrature-sum the uncertainties
+        out.attrs['log_evidence'] = logsumexp(logz_list)
+        out.attrs['dlog_evidence'] = numpy.sqrt(sum([i**2 for i in dlogz_list]))
+    else:
+        # average the evidences; quadrature-sum and scale the uncertainties
+        n = len(opts.input_file)
+        out.attrs['log_evidence'] = logsumexp(logz_list) - numpy.log(n)
+        out.attrs['dlog_evidence'] = numpy.sqrt(sum([i**2 for i in dlogz_list])) / n

 # write the other groups using the first file
 fp = fps[0]
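
To make the two evidence-combination rules concrete, here is a worked example with made-up numbers, mirroring the arithmetic in the block above (the straight quadrature propagation of the dlog_evidence values is the commit's approximation, carried over as-is):

import numpy
from scipy.special import logsumexp

# hypothetical per-file log evidences and their uncertainties
logz_list = [100.0, 100.7]
dlogz_list = [0.1, 0.2]
n = len(logz_list)

# --mutually-exclusive-priors: Z_total = sum of the Z_i
logz_sum = logsumexp(logz_list)                            # ~101.10
dlogz_sum = numpy.sqrt(sum(d**2 for d in dlogz_list))      # ~0.224

# default (identical priors): Z_total = mean of the Z_i
logz_avg = logsumexp(logz_list) - numpy.log(n)             # ~100.41
dlogz_avg = numpy.sqrt(sum(d**2 for d in dlogz_list)) / n  # ~0.112

print(f"summed:   {logz_sum:.2f} +/- {dlogz_sum:.3f}")
print(f"averaged: {logz_avg:.2f} +/- {dlogz_avg:.3f}")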
