Skip to content

Commit

Permalink
Fix asymmetric PLD creation in DP Accounting
Browse files Browse the repository at this point in the history
DP Accounting changes:
* Fix asymmetric PLD from two probability mass functions
* Update tests for PLDAccountant

Privacy on Beam:
* Fix tolerance in TestMeanPerKeyCrossPartitionContributionBounding

Change-Id: I4f7ec279b5eaf41638bf41fafc740fefe7a168c2
GitOrigin-RevId: 5b04c28c6cac908fff04796604804625a33785ac
  • Loading branch information
Differential Privacy Team authored and dibakch committed Oct 28, 2024
1 parent 9b4401a commit 0e99a6f
Show file tree
Hide file tree
Showing 5 changed files with 235 additions and 86 deletions.
81 changes: 53 additions & 28 deletions privacy-on-beam/pbeam/mean_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1029,34 +1029,46 @@ func TestMeanKeyNegativeBounds(t *testing.T) {
func TestMeanPerKeyCrossPartitionContributionBounding(t *testing.T) {
var triples []testutils.TripleWithFloatValue

// id 0 contributes to partition 0 and 1 with value 150.0.
// ids [1, 4] each contributes to partition 0 with value 0.0.
// ids [5, 8] each contributes to partition 1 with value 0.0.
triples = append(triples, testutils.MakeTripleWithFloatValue(1, 0, 150)...)
triples = append(triples, testutils.MakeTripleWithFloatValue(1, 1, 150)...)

triples = append(triples, testutils.MakeTripleWithFloatValueStartingFromKey(1, 50, 0, 0)...)
triples = append(triples, testutils.MakeTripleWithFloatValueStartingFromKey(51, 50, 1, 0)...)
triples = append(triples, testutils.MakeTripleWithFloatValueStartingFromKey(1, 4, 0, 0)...)
triples = append(triples, testutils.MakeTripleWithFloatValueStartingFromKey(5, 4, 1, 0)...)

// MaxPartitionContributed = 1, but id = 0 contributes to 2 partitions (0 and 1).
// There will be cross-partition contribution bounding stage.
// In this stage the algorithm will randomly chose either contribution for partition 0 or contribution to partition 1.
// The sum of 2 means should be equal to 150/51 + 0/50 = 150/51 ≈ 2.94 in both cases (unlike 150/51 + 150/51 ≈ 5.88, if no cross-partition contribution bounding is done).
// The difference between these numbers ≈ 2.94 and the tolerance (see below) is ≈ 0.04, so the test should catch if there was no cross-partition contribution bounding.
exactCount := 51.0
exactMean := 150.0 / exactCount
// In this stage the algorithm will randomly keep either partition 0 or partition 1 for id 0.
// The sum of 2 means should be equal to 150/5 + 0/4 = 30 in both cases
// (unlike 150/5 + 150/5 = 60, if no cross-partition contribution bounding is done).
// The difference between these numbers is 30 (60-30), and the sum of two tolerances (see below)
// is ≈ 26.6277 (11.4598 + 15.1679),
// so the test should fail if there was no cross-partition contribution bounding.
minValue := 0.0
maxValue := 150.0
midValue := (minValue + maxValue) / 2
count2 := 4.0
count1 := count2 + 1
normalizedSum2 := (0.0 - midValue) * count2
normalizedSum1 := normalizedSum2 + (150.0 - midValue)
normalizedMean2 := normalizedSum2 / count2
normalizedMean1 := normalizedSum1 / count1
mean1, mean2 := normalizedMean1+midValue, normalizedMean2+midValue
result := []testutils.PairIF64{
{Key: 0, Value: exactMean},
{Key: 0, Value: mean1 + mean2},
}
p, s, col, want := ptest.CreateList2(triples, result)
col = beam.ParDo(s, testutils.ExtractIDFromTripleWithFloatValue, col)

// ε=60, δ=0.01 and l0Sensitivity=1 gives a threshold of =2.
// ε=10000, δ=0.01 and l0Sensitivity=1 gives a threshold of =2.
// We have 2 partitions. So, to get an overall flakiness of 10⁻²³,
// we can have each partition fail with 10⁻²⁴ probability (k=24).
maxContributionsPerPartition := int64(1)
maxPartitionsContributed := int64(1)
epsilon := 60.0
epsilon := 1e4
delta := 0.01
minValue := 0.0
maxValue := 150.0

pcol := MakePrivate(s, col, privacySpec(t,
PrivacySpecParams{
Expand All @@ -1079,20 +1091,21 @@ func TestMeanPerKeyCrossPartitionContributionBounding(t *testing.T) {

want = beam.ParDo(s, testutils.PairIF64ToKV, want)

// Tolerance for the partition with an extra contribution which is equal to 150.
// Tolerance for the partition with an extra contribution with value 150.0.
tolerance1, err := testutils.LaplaceToleranceForMean(
24, minValue, maxValue, maxContributionsPerPartition, maxPartitionsContributed,
epsilon, -3675.0, 51.0, exactMean) // ≈0.00367
epsilon, normalizedSum1, count1, mean1) // ≈15.1679
if err != nil {
t.Fatalf("LaplaceToleranceForMean: got error %v", err)
}
// Tolerance for the partition without an extra contribution.
tolerance2, err := testutils.LaplaceToleranceForMean(
24, minValue, maxValue, maxContributionsPerPartition, maxPartitionsContributed,
epsilon, -3700.0, 50.0, 0.0) // ≈1.074
epsilon, normalizedSum2, count2, mean2) // ≈11.4598
if err != nil {
t.Fatalf("LaplaceToleranceForMean: got error %v", err)
}

testutils.ApproxEqualsKVFloat64(t, s, got, want, tolerance1+tolerance2)
if err := ptest.Run(p); err != nil {
t.Errorf("TestMeanPerKeyCrossPartitionContributionBounding: MeanPerKey(%v) = %v, "+
Expand Down Expand Up @@ -1308,21 +1321,35 @@ func TestMeanPerKeyWithPartitionsCrossPartitionContributionBounding(t *testing.T
{true},
{false},
} {
// id 0 contributes to partition 0 and 1 with value 150.0.
// ids [1, 4] each contributes to partition 0 with value 0.0.
// ids [5, 8] each contributes to partition 1 with value 0.0.
var triples []testutils.TripleWithFloatValue
triples = append(triples, testutils.MakeTripleWithFloatValue(1, 0, 150)...)
triples = append(triples, testutils.MakeTripleWithFloatValue(1, 1, 150)...)
triples = append(triples, testutils.MakeTripleWithFloatValueStartingFromKey(1, 50, 0, 0)...)
triples = append(triples, testutils.MakeTripleWithFloatValueStartingFromKey(51, 50, 1, 0)...)
triples = append(triples, testutils.MakeTripleWithFloatValueStartingFromKey(1, 4, 0, 0)...)
triples = append(triples, testutils.MakeTripleWithFloatValueStartingFromKey(5, 8, 1, 0)...)

// MaxPartitionContributed = 1, but id = 0 contributes to 2 partitions (0 and 1).
// There will be cross-partition contribution bounding stage.
// In this stage the algorithm will typically randomly choose either contribution for partition 0 or contribution to partition 1.
// The sum of 2 means should be equal to 150/51 + 0/50 = 150/51 ≈ 2.94 in both cases (unlike 150/51 + 150/51 ≈ 5.88, if no cross-partition contribution bounding is done).
// The difference between these numbers ≈ 2.94 and the tolerance (see below) is ≈ 0.04, so the test should catch if there was no cross-partition contribution bounding.
exactCount := 51.0
exactMean := 150.0 / exactCount
// In this stage the algorithm will randomly keep either partition 0 or partition 1 for id 0.
// The sum of 2 means should be equal to 150/5 + 0/4 = 30 in both cases
// (unlike 150/5 + 150/5 = 60, if no cross-partition contribution bounding is done).
// The difference between these numbers is 30 (60-30), and the sum of two tolerances (see below)
// is ≈ 26.6433 (11.4685 + 15.1748),
// so the test should fail if there was no cross-partition contribution bounding.
minValue := 0.0
maxValue := 150.0
midValue := (minValue + maxValue) / 2
count2 := 4.0
count1 := count2 + 1
normalizedSum2 := (0.0 - midValue) * count2
normalizedSum1 := normalizedSum2 + (150.0 - midValue)
normalizedMean2 := normalizedSum2 / count2
normalizedMean1 := normalizedSum1 / count1
mean1, mean2 := normalizedMean1+midValue, normalizedMean2+midValue
result := []testutils.PairIF64{
{Key: 0, Value: exactMean},
{Key: 0, Value: mean1 + mean2},
}
publicPartitionsSlice := []int{0, 1}

Expand All @@ -1338,9 +1365,7 @@ func TestMeanPerKeyWithPartitionsCrossPartitionContributionBounding(t *testing.T

maxContributionsPerPartition := int64(1)
maxPartitionsContributed := int64(1)
epsilon := 60.0
minValue := 0.0
maxValue := 150.0
epsilon := 1e4

// ε is not split, because partitions are public.
pcol := MakePrivate(s, col, privacySpec(t, PrivacySpecParams{AggregationEpsilon: epsilon}))
Expand All @@ -1364,14 +1389,14 @@ func TestMeanPerKeyWithPartitionsCrossPartitionContributionBounding(t *testing.T
// Tolerance for the partition with an extra contribution which is equal to 150.
tolerance1, err := testutils.LaplaceToleranceForMean(
25, minValue, maxValue, maxContributionsPerPartition, maxPartitionsContributed,
epsilon, -3675.0, 51.0, exactMean) // ≈0.00367
epsilon, normalizedSum1, count1, mean1) // ≈11.4685
if err != nil {
t.Fatalf("LaplaceToleranceForMean in-memory=%t: got error %v", tc.inMemory, err)
}
// Tolerance for the partition without an extra contribution.
tolerance2, err := testutils.LaplaceToleranceForMean(
25, minValue, maxValue, maxContributionsPerPartition, maxPartitionsContributed,
epsilon, -3700.0, 50.0, 0.0) // ≈1.074
epsilon, normalizedSum2, count2, mean2) // ≈15.1748
if err != nil {
t.Fatalf("LaplaceToleranceForMean in-memory=%t: got error %v", tc.inMemory, err)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,14 @@ class PldPrivacyAccountantTest(privacy_accountant_test.PrivacyAccountantTest,
parameterized.TestCase):

def _make_test_accountants(self):
return [pld_privacy_accountant.PLDAccountant()]
return [
pld_privacy_accountant.PLDAccountant(
pld_privacy_accountant.NeighborRel.ADD_OR_REMOVE_ONE),
pld_privacy_accountant.PLDAccountant(
pld_privacy_accountant.NeighborRel.REPLACE_ONE),
pld_privacy_accountant.PLDAccountant(
pld_privacy_accountant.NeighborRel.REPLACE_SPECIAL),
]

@parameterized.parameters(
dp_event.GaussianDpEvent(1.0),
Expand Down
113 changes: 58 additions & 55 deletions python/dp_accounting/dp_accounting/pld/privacy_loss_distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,11 @@ class PrivacyLossDistribution:
this class associated to various mechanisms.
Attributes:
_basic_pld_remove: basic privacy loss distribution with respect to REMOVE
adjacency.
_basic_pld_add: basic privacy loss distribution with respect to ADD
adjacency.
_symmetric: When True, basic_pld_add is assumed to be the same as
basic_pld_remove.
_basic_pld: An alias for basic_pld_remove. Useful when symmetric is True.
_pmf_remove: the privacy loss distribution probability mass function with
respect to REMOVE adjacency.
_pmf_add: the privacy loss distribution probability mass function with
respect to ADD adjacency.
_symmetric: When True, _pmf_add is assumed to be the same as _pmf_remove.
"""

def __init__(self,
Expand All @@ -100,7 +98,7 @@ def __init__(self,
"""Initialization method for PrivacyLossDistribution."""
self._pmf_remove = pmf_remove
self._symmetric = pmf_add is None
self._pmf_add = pmf_remove if self._symmetric else pmf_add
self._pmf_add = pmf_remove if pmf_add is None else pmf_add

@classmethod
def create_from_rounded_probability(
Expand Down Expand Up @@ -718,70 +716,71 @@ def from_two_probability_mass_functions(
"""

def _create_rounded_probability_mass_function(
log_probability_mass_function_lower: Mapping[Any, float],
log_probability_mass_function_upper: Mapping[Any, float]
log_pmf_lower: Mapping[Any, float],
log_pmf_upper: Mapping[Any, float]
) -> Tuple[float, Mapping[int, float]]:
"""Helper function for creating rounded pmf."""
infinity_mass = 0
for outcome in log_probability_mass_function_upper:
if log_probability_mass_function_lower.get(outcome,
-math.inf) == -math.inf:
for outcome in log_pmf_upper:
if log_pmf_lower.get(outcome, -math.inf) == -math.inf:
# When an outcome only appears in the upper distribution but not in the
# lower distribution, then it must be counted in infinity_mass as such
# an outcome contributes to the hockey stick divergence.
infinity_mass += math.exp(log_probability_mass_function_upper[outcome])
infinity_mass += math.exp(log_pmf_upper[outcome])
# Compute the (non-discretized) probability mass function for the privacy
# loss distribution.
probability_mass_function = {}
for outcome in log_probability_mass_function_lower:
if log_probability_mass_function_lower[outcome] == -math.inf:
pmf = {}
for outcome, log_prob_lower in log_pmf_lower.items():
if log_prob_lower == -math.inf:
# This outcome never occurs in mu_lower. This case was already included
# as infinity_mass above.
continue
elif (log_probability_mass_function_upper.get(outcome, -math.inf) >
log_mass_truncation_bound):
elif log_pmf_upper.get(outcome, -math.inf) > log_mass_truncation_bound:
# When the probability mass of mu_upper at the outcome is greater than
# the threshold, add it to the distribution.
privacy_loss_value = (
log_probability_mass_function_upper[outcome] -
log_probability_mass_function_lower[outcome])
probability_mass_function[privacy_loss_value] = (
probability_mass_function.get(privacy_loss_value, 0) +
math.exp(log_probability_mass_function_upper[outcome]))
privacy_loss_value = log_pmf_upper[outcome] - log_prob_lower
pmf[privacy_loss_value] = (
pmf.get(privacy_loss_value, 0) + math.exp(log_pmf_upper[outcome])
)
else:
if pessimistic_estimate:
# When the probability mass of mu_upper at the outcome is no more than
# the threshold and we would like to get a pessimistic estimate,
# account for this in infinity_mass.
infinity_mass += math.exp(
log_probability_mass_function_upper.get(outcome, -math.inf))
infinity_mass += math.exp(log_pmf_upper.get(outcome, -math.inf))
# Discretize the probability mass so that the values are integer multiples
# of value_discretization_interval
rounded_probability_mass_function = collections.defaultdict(lambda: 0)
rounded_pmf = collections.defaultdict(lambda: 0)
round_fn = math.ceil if pessimistic_estimate else math.floor
for val in probability_mass_function:
rounded_probability_mass_function[round_fn(
val /
value_discretization_interval)] += probability_mass_function[val]
return infinity_mass, rounded_probability_mass_function
for val in pmf:
rounded_pmf[round_fn(val / value_discretization_interval)] += pmf[val]
return infinity_mass, rounded_pmf

infinity_mass, rounded_probability_mass_function = _create_rounded_probability_mass_function(
log_probability_mass_function_lower, log_probability_mass_function_upper)
infinity_mass, rounded_probability_mass_function = (
_create_rounded_probability_mass_function(
log_pmf_lower=log_probability_mass_function_lower,
log_pmf_upper=log_probability_mass_function_upper
)
)

if symmetric:
return PrivacyLossDistribution.create_from_rounded_probability(
rounded_probability_mass_function,
infinity_mass,
value_discretization_interval,
pessimistic_estimate=pessimistic_estimate)
rounded_probability_mass_function, infinity_mass,
value_discretization_interval, pessimistic_estimate,
)

infinity_mass_add, rounded_probability_mass_function_add = _create_rounded_probability_mass_function(
log_probability_mass_function_lower=log_probability_mass_function_upper,
log_probability_mass_function_upper=log_probability_mass_function_lower)
infinity_mass_add, rounded_probability_mass_function_add = (
_create_rounded_probability_mass_function(
log_pmf_lower=log_probability_mass_function_upper,
log_pmf_upper=log_probability_mass_function_lower,
)
)
return PrivacyLossDistribution.create_from_rounded_probability(
rounded_probability_mass_function, infinity_mass,
value_discretization_interval, pessimistic_estimate,
rounded_probability_mass_function_add, infinity_mass_add)
rounded_probability_mass_function_add, infinity_mass_add,
symmetric=False,
)


def _create_pld_pmf_from_additive_noise(
Expand Down Expand Up @@ -846,6 +845,14 @@ def _create_pld_pmf_from_additive_noise(
if math.ceil(scaled_epsilon) > rounded_epsilons[-1]:
rounded_epsilons.append(math.ceil(scaled_epsilon))
rounded_epsilons = np.array(rounded_epsilons)

deltas = additive_noise_privacy_loss.get_delta_for_epsilon(
rounded_epsilons * value_discretization_interval)

return pld_pmf.create_pmf_pessimistic_connect_dots(
value_discretization_interval,
rounded_epsilons,
deltas)
else:
if (connect_dots_bounds.epsilon_upper is None or
connect_dots_bounds.epsilon_lower is None):
Expand All @@ -859,20 +866,15 @@ def _create_pld_pmf_from_additive_noise(
rounded_epsilons = np.arange(rounded_epsilon_lower,
rounded_epsilon_upper + 1)

deltas = additive_noise_privacy_loss.get_delta_for_epsilon(
rounded_epsilons * value_discretization_interval)
deltas = additive_noise_privacy_loss.get_delta_for_epsilon(
rounded_epsilons * value_discretization_interval)

if additive_noise_privacy_loss.discrete_noise:
return pld_pmf.create_pmf_pessimistic_connect_dots(
# Use a specialized numerically stable approach for continuous noise
return pld_pmf.create_pmf_pessimistic_connect_dots_fixed_gap(
value_discretization_interval,
rounded_epsilons,
rounded_epsilon_lower,
rounded_epsilon_upper,
deltas)
# Else use specialized numerically stable approach for continuous noise
return pld_pmf.create_pmf_pessimistic_connect_dots_fixed_gap(
value_discretization_interval,
rounded_epsilon_lower,
rounded_epsilon_upper,
deltas)

round_fn = math.ceil if pessimistic_estimate else math.floor

Expand Down Expand Up @@ -1039,7 +1041,7 @@ def from_randomized_response(

if noise_parameter <= 0 or noise_parameter >= 1:
raise ValueError(f'Noise parameter must be strictly between 0 and 1: '
f'{noise_parameter}')
f'Found {noise_parameter}')

if num_buckets <= 1:
raise ValueError(
Expand Down Expand Up @@ -1429,3 +1431,4 @@ def from_privacy_parameters(
return PrivacyLossDistribution.create_from_rounded_probability(
rounded_probability_mass_function, privacy_parameters.delta,
value_discretization_interval)

Loading

0 comments on commit 0e99a6f

Please sign in to comment.