Skip to content

Commit

Permalink
Fix asymmetric PLD creation in DP Accounting
Browse files Browse the repository at this point in the history
DP Accounting changes:
* Fix asymmetric PLD from two probability mass functions
* Update tests for PLDAccountant

Privacy on Beam:
* Fix tolerance in TestMeanPerKeyCrossPartitionContributionBounding

Change-Id: I4f7ec279b5eaf41638bf41fafc740fefe7a168c2
GitOrigin-RevId: 5b04c28c6cac908fff04796604804625a33785ac
  • Loading branch information
Differential Privacy Team authored and dibakch committed Oct 28, 2024
1 parent 9b4401a commit 0e99a6f
Show file tree
Hide file tree
Showing 5 changed files with 235 additions and 86 deletions.
81 changes: 53 additions & 28 deletions privacy-on-beam/pbeam/mean_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1029,34 +1029,46 @@ func TestMeanKeyNegativeBounds(t *testing.T) {
func TestMeanPerKeyCrossPartitionContributionBounding(t *testing.T) {
var triples []testutils.TripleWithFloatValue

// id 0 contributes to partition 0 and 1 with value 150.0.
// ids [1, 4] each contributes to partition 0 with value 0.0.
// ids [5, 8] each contributes to partition 1 with value 0.0.
triples = append(triples, testutils.MakeTripleWithFloatValue(1, 0, 150)...)
triples = append(triples, testutils.MakeTripleWithFloatValue(1, 1, 150)...)

triples = append(triples, testutils.MakeTripleWithFloatValueStartingFromKey(1, 50, 0, 0)...)
triples = append(triples, testutils.MakeTripleWithFloatValueStartingFromKey(51, 50, 1, 0)...)
triples = append(triples, testutils.MakeTripleWithFloatValueStartingFromKey(1, 4, 0, 0)...)
triples = append(triples, testutils.MakeTripleWithFloatValueStartingFromKey(5, 4, 1, 0)...)

// MaxPartitionContributed = 1, but id = 0 contributes to 2 partitions (0 and 1).
// There will be cross-partition contribution bounding stage.
// In this stage the algorithm will randomly chose either contribution for partition 0 or contribution to partition 1.
// The sum of 2 means should be equal to 150/51 + 0/50 = 150/51 ≈ 2.94 in both cases (unlike 150/51 + 150/51 ≈ 5.88, if no cross-partition contribution bounding is done).
// The difference between these numbers ≈ 2.94 and the tolerance (see below) is ≈ 0.04, so the test should catch if there was no cross-partition contribution bounding.
exactCount := 51.0
exactMean := 150.0 / exactCount
// In this stage the algorithm will randomly keep either partition 0 or partition 1 for id 0.
// The sum of 2 means should be equal to 150/5 + 0/4 = 30 in both cases
// (unlike 150/5 + 150/5 = 60, if no cross-partition contribution bounding is done).
// The difference between these numbers is 30 (60-30), and the sum of two tolerances (see below)
// is ≈ 26.6277 (11.4598 + 15.1679),
// so the test should fail if there was no cross-partition contribution bounding.
minValue := 0.0
maxValue := 150.0
midValue := (minValue + maxValue) / 2
count2 := 4.0
count1 := count2 + 1
normalizedSum2 := (0.0 - midValue) * count2
normalizedSum1 := normalizedSum2 + (150.0 - midValue)
normalizedMean2 := normalizedSum2 / count2
normalizedMean1 := normalizedSum1 / count1
mean1, mean2 := normalizedMean1+midValue, normalizedMean2+midValue
result := []testutils.PairIF64{
{Key: 0, Value: exactMean},
{Key: 0, Value: mean1 + mean2},
}
p, s, col, want := ptest.CreateList2(triples, result)
col = beam.ParDo(s, testutils.ExtractIDFromTripleWithFloatValue, col)

// ε=60, δ=0.01 and l0Sensitivity=1 gives a threshold of =2.
// ε=10000, δ=0.01 and l0Sensitivity=1 gives a threshold of =2.
// We have 2 partitions. So, to get an overall flakiness of 10⁻²³,
// we can have each partition fail with 10⁻²⁴ probability (k=24).
maxContributionsPerPartition := int64(1)
maxPartitionsContributed := int64(1)
epsilon := 60.0
epsilon := 1e4
delta := 0.01
minValue := 0.0
maxValue := 150.0

pcol := MakePrivate(s, col, privacySpec(t,
PrivacySpecParams{
Expand All @@ -1079,20 +1091,21 @@ func TestMeanPerKeyCrossPartitionContributionBounding(t *testing.T) {

want = beam.ParDo(s, testutils.PairIF64ToKV, want)

// Tolerance for the partition with an extra contribution which is equal to 150.
// Tolerance for the partition with an extra contribution with value 150.0.
tolerance1, err := testutils.LaplaceToleranceForMean(
24, minValue, maxValue, maxContributionsPerPartition, maxPartitionsContributed,
epsilon, -3675.0, 51.0, exactMean) // ≈0.00367
epsilon, normalizedSum1, count1, mean1) // ≈15.1679
if err != nil {
t.Fatalf("LaplaceToleranceForMean: got error %v", err)
}
// Tolerance for the partition without an extra contribution.
tolerance2, err := testutils.LaplaceToleranceForMean(
24, minValue, maxValue, maxContributionsPerPartition, maxPartitionsContributed,
epsilon, -3700.0, 50.0, 0.0) // ≈1.074
epsilon, normalizedSum2, count2, mean2) // ≈11.4598
if err != nil {
t.Fatalf("LaplaceToleranceForMean: got error %v", err)
}

testutils.ApproxEqualsKVFloat64(t, s, got, want, tolerance1+tolerance2)
if err := ptest.Run(p); err != nil {
t.Errorf("TestMeanPerKeyCrossPartitionContributionBounding: MeanPerKey(%v) = %v, "+
Expand Down Expand Up @@ -1308,21 +1321,35 @@ func TestMeanPerKeyWithPartitionsCrossPartitionContributionBounding(t *testing.T
{true},
{false},
} {
// id 0 contributes to partition 0 and 1 with value 150.0.
// ids [1, 4] each contributes to partition 0 with value 0.0.
// ids [5, 8] each contributes to partition 1 with value 0.0.
var triples []testutils.TripleWithFloatValue
triples = append(triples, testutils.MakeTripleWithFloatValue(1, 0, 150)...)
triples = append(triples, testutils.MakeTripleWithFloatValue(1, 1, 150)...)
triples = append(triples, testutils.MakeTripleWithFloatValueStartingFromKey(1, 50, 0, 0)...)
triples = append(triples, testutils.MakeTripleWithFloatValueStartingFromKey(51, 50, 1, 0)...)
triples = append(triples, testutils.MakeTripleWithFloatValueStartingFromKey(1, 4, 0, 0)...)
triples = append(triples, testutils.MakeTripleWithFloatValueStartingFromKey(5, 8, 1, 0)...)

// MaxPartitionContributed = 1, but id = 0 contributes to 2 partitions (0 and 1).
// There will be cross-partition contribution bounding stage.
// In this stage the algorithm will typically randomly choose either contribution for partition 0 or contribution to partition 1.
// The sum of 2 means should be equal to 150/51 + 0/50 = 150/51 ≈ 2.94 in both cases (unlike 150/51 + 150/51 ≈ 5.88, if no cross-partition contribution bounding is done).
// The difference between these numbers ≈ 2.94 and the tolerance (see below) is ≈ 0.04, so the test should catch if there was no cross-partition contribution bounding.
exactCount := 51.0
exactMean := 150.0 / exactCount
// In this stage the algorithm will randomly keep either partition 0 or partition 1 for id 0.
// The sum of 2 means should be equal to 150/5 + 0/4 = 30 in both cases
// (unlike 150/5 + 150/5 = 60, if no cross-partition contribution bounding is done).
// The difference between these numbers is 30 (60-30), and the sum of two tolerances (see below)
// is ≈ 26.6433 (11.4685 + 15.1748),
// so the test should fail if there was no cross-partition contribution bounding.
minValue := 0.0
maxValue := 150.0
midValue := (minValue + maxValue) / 2
count2 := 4.0
count1 := count2 + 1
normalizedSum2 := (0.0 - midValue) * count2
normalizedSum1 := normalizedSum2 + (150.0 - midValue)
normalizedMean2 := normalizedSum2 / count2
normalizedMean1 := normalizedSum1 / count1
mean1, mean2 := normalizedMean1+midValue, normalizedMean2+midValue
result := []testutils.PairIF64{
{Key: 0, Value: exactMean},
{Key: 0, Value: mean1 + mean2},
}
publicPartitionsSlice := []int{0, 1}

Expand All @@ -1338,9 +1365,7 @@ func TestMeanPerKeyWithPartitionsCrossPartitionContributionBounding(t *testing.T

maxContributionsPerPartition := int64(1)
maxPartitionsContributed := int64(1)
epsilon := 60.0
minValue := 0.0
maxValue := 150.0
epsilon := 1e4

// ε is not split, because partitions are public.
pcol := MakePrivate(s, col, privacySpec(t, PrivacySpecParams{AggregationEpsilon: epsilon}))
Expand All @@ -1364,14 +1389,14 @@ func TestMeanPerKeyWithPartitionsCrossPartitionContributionBounding(t *testing.T
// Tolerance for the partition with an extra contribution which is equal to 150.
tolerance1, err := testutils.LaplaceToleranceForMean(
25, minValue, maxValue, maxContributionsPerPartition, maxPartitionsContributed,
epsilon, -3675.0, 51.0, exactMean) // ≈0.00367
epsilon, normalizedSum1, count1, mean1) // ≈11.4685
if err != nil {
t.Fatalf("LaplaceToleranceForMean in-memory=%t: got error %v", tc.inMemory, err)
}
// Tolerance for the partition without an extra contribution.
tolerance2, err := testutils.LaplaceToleranceForMean(
25, minValue, maxValue, maxContributionsPerPartition, maxPartitionsContributed,
epsilon, -3700.0, 50.0, 0.0) // ≈1.074
epsilon, normalizedSum2, count2, mean2) // ≈15.1748
if err != nil {
t.Fatalf("LaplaceToleranceForMean in-memory=%t: got error %v", tc.inMemory, err)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,14 @@ class PldPrivacyAccountantTest(privacy_accountant_test.PrivacyAccountantTest,
parameterized.TestCase):

def _make_test_accountants(self):
return [pld_privacy_accountant.PLDAccountant()]
return [
pld_privacy_accountant.PLDAccountant(
pld_privacy_accountant.NeighborRel.ADD_OR_REMOVE_ONE),
pld_privacy_accountant.PLDAccountant(
pld_privacy_accountant.NeighborRel.REPLACE_ONE),
pld_privacy_accountant.PLDAccountant(
pld_privacy_accountant.NeighborRel.REPLACE_SPECIAL),
]

@parameterized.parameters(
dp_event.GaussianDpEvent(1.0),
Expand Down
113 changes: 58 additions & 55 deletions python/dp_accounting/dp_accounting/pld/privacy_loss_distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,11 @@ class PrivacyLossDistribution:
this class associated to various mechanisms.
Attributes:
_basic_pld_remove: basic privacy loss distribution with respect to REMOVE
adjacency.
_basic_pld_add: basic privacy loss distribution with respect to ADD
adjacency.
_symmetric: When True, basic_pld_add is assumed to be the same as
basic_pld_remove.
_basic_pld: An alias for basic_pld_remove. Useful when symmetric is True.
_pmf_remove: the privacy loss distribution probability mass function with
respect to REMOVE adjacency.
_pmf_add: the privacy loss distribution probability mass function with
respect to ADD adjacency.
_symmetric: When True, _pmf_add is assumed to be the same as _pmf_remove.
"""

def __init__(self,
Expand All @@ -100,7 +98,7 @@ def __init__(self,
"""Initialization method for PrivacyLossDistribution."""
self._pmf_remove = pmf_remove
self._symmetric = pmf_add is None
self._pmf_add = pmf_remove if self._symmetric else pmf_add
self._pmf_add = pmf_remove if pmf_add is None else pmf_add

@classmethod
def create_from_rounded_probability(
Expand Down Expand Up @@ -718,70 +716,71 @@ def from_two_probability_mass_functions(
"""

def _create_rounded_probability_mass_function(
log_probability_mass_function_lower: Mapping[Any, float],
log_probability_mass_function_upper: Mapping[Any, float]
log_pmf_lower: Mapping[Any, float],
log_pmf_upper: Mapping[Any, float]
) -> Tuple[float, Mapping[int, float]]:
"""Helper function for creating rounded pmf."""
infinity_mass = 0
for outcome in log_probability_mass_function_upper:
if log_probability_mass_function_lower.get(outcome,
-math.inf) == -math.inf:
for outcome in log_pmf_upper:
if log_pmf_lower.get(outcome, -math.inf) == -math.inf:
# When an outcome only appears in the upper distribution but not in the
# lower distribution, then it must be counted in infinity_mass as such
# an outcome contributes to the hockey stick divergence.
infinity_mass += math.exp(log_probability_mass_function_upper[outcome])
infinity_mass += math.exp(log_pmf_upper[outcome])
# Compute the (non-discretized) probability mass function for the privacy
# loss distribution.
probability_mass_function = {}
for outcome in log_probability_mass_function_lower:
if log_probability_mass_function_lower[outcome] == -math.inf:
pmf = {}
for outcome, log_prob_lower in log_pmf_lower.items():
if log_prob_lower == -math.inf:
# This outcome never occurs in mu_lower. This case was already included
# as infinity_mass above.
continue
elif (log_probability_mass_function_upper.get(outcome, -math.inf) >
log_mass_truncation_bound):
elif log_pmf_upper.get(outcome, -math.inf) > log_mass_truncation_bound:
# When the probability mass of mu_upper at the outcome is greater than
# the threshold, add it to the distribution.
privacy_loss_value = (
log_probability_mass_function_upper[outcome] -
log_probability_mass_function_lower[outcome])
probability_mass_function[privacy_loss_value] = (
probability_mass_function.get(privacy_loss_value, 0) +
math.exp(log_probability_mass_function_upper[outcome]))
privacy_loss_value = log_pmf_upper[outcome] - log_prob_lower
pmf[privacy_loss_value] = (
pmf.get(privacy_loss_value, 0) + math.exp(log_pmf_upper[outcome])
)
else:
if pessimistic_estimate:
# When the probability mass of mu_upper at the outcome is no more than
# the threshold and we would like to get a pessimistic estimate,
# account for this in infinity_mass.
infinity_mass += math.exp(
log_probability_mass_function_upper.get(outcome, -math.inf))
infinity_mass += math.exp(log_pmf_upper.get(outcome, -math.inf))
# Discretize the probability mass so that the values are integer multiples
# of value_discretization_interval
rounded_probability_mass_function = collections.defaultdict(lambda: 0)
rounded_pmf = collections.defaultdict(lambda: 0)
round_fn = math.ceil if pessimistic_estimate else math.floor
for val in probability_mass_function:
rounded_probability_mass_function[round_fn(
val /
value_discretization_interval)] += probability_mass_function[val]
return infinity_mass, rounded_probability_mass_function
for val in pmf:
rounded_pmf[round_fn(val / value_discretization_interval)] += pmf[val]
return infinity_mass, rounded_pmf

infinity_mass, rounded_probability_mass_function = _create_rounded_probability_mass_function(
log_probability_mass_function_lower, log_probability_mass_function_upper)
infinity_mass, rounded_probability_mass_function = (
_create_rounded_probability_mass_function(
log_pmf_lower=log_probability_mass_function_lower,
log_pmf_upper=log_probability_mass_function_upper
)
)

if symmetric:
return PrivacyLossDistribution.create_from_rounded_probability(
rounded_probability_mass_function,
infinity_mass,
value_discretization_interval,
pessimistic_estimate=pessimistic_estimate)
rounded_probability_mass_function, infinity_mass,
value_discretization_interval, pessimistic_estimate,
)

infinity_mass_add, rounded_probability_mass_function_add = _create_rounded_probability_mass_function(
log_probability_mass_function_lower=log_probability_mass_function_upper,
log_probability_mass_function_upper=log_probability_mass_function_lower)
infinity_mass_add, rounded_probability_mass_function_add = (
_create_rounded_probability_mass_function(
log_pmf_lower=log_probability_mass_function_upper,
log_pmf_upper=log_probability_mass_function_lower,
)
)
return PrivacyLossDistribution.create_from_rounded_probability(
rounded_probability_mass_function, infinity_mass,
value_discretization_interval, pessimistic_estimate,
rounded_probability_mass_function_add, infinity_mass_add)
rounded_probability_mass_function_add, infinity_mass_add,
symmetric=False,
)


def _create_pld_pmf_from_additive_noise(
Expand Down Expand Up @@ -846,6 +845,14 @@ def _create_pld_pmf_from_additive_noise(
if math.ceil(scaled_epsilon) > rounded_epsilons[-1]:
rounded_epsilons.append(math.ceil(scaled_epsilon))
rounded_epsilons = np.array(rounded_epsilons)

deltas = additive_noise_privacy_loss.get_delta_for_epsilon(
rounded_epsilons * value_discretization_interval)

return pld_pmf.create_pmf_pessimistic_connect_dots(
value_discretization_interval,
rounded_epsilons,
deltas)
else:
if (connect_dots_bounds.epsilon_upper is None or
connect_dots_bounds.epsilon_lower is None):
Expand All @@ -859,20 +866,15 @@ def _create_pld_pmf_from_additive_noise(
rounded_epsilons = np.arange(rounded_epsilon_lower,
rounded_epsilon_upper + 1)

deltas = additive_noise_privacy_loss.get_delta_for_epsilon(
rounded_epsilons * value_discretization_interval)
deltas = additive_noise_privacy_loss.get_delta_for_epsilon(
rounded_epsilons * value_discretization_interval)

if additive_noise_privacy_loss.discrete_noise:
return pld_pmf.create_pmf_pessimistic_connect_dots(
# Use a specialized numerically stable approach for continuous noise
return pld_pmf.create_pmf_pessimistic_connect_dots_fixed_gap(
value_discretization_interval,
rounded_epsilons,
rounded_epsilon_lower,
rounded_epsilon_upper,
deltas)
# Else use specialized numerically stable approach for continuous noise
return pld_pmf.create_pmf_pessimistic_connect_dots_fixed_gap(
value_discretization_interval,
rounded_epsilon_lower,
rounded_epsilon_upper,
deltas)

round_fn = math.ceil if pessimistic_estimate else math.floor

Expand Down Expand Up @@ -1039,7 +1041,7 @@ def from_randomized_response(

if noise_parameter <= 0 or noise_parameter >= 1:
raise ValueError(f'Noise parameter must be strictly between 0 and 1: '
f'{noise_parameter}')
f'Found {noise_parameter}')

if num_buckets <= 1:
raise ValueError(
Expand Down Expand Up @@ -1429,3 +1431,4 @@ def from_privacy_parameters(
return PrivacyLossDistribution.create_from_rounded_probability(
rounded_probability_mass_function, privacy_parameters.delta,
value_discretization_interval)

Loading

0 comments on commit 0e99a6f

Please sign in to comment.