Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

false positive risk #83

Merged
merged 1 commit into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[tool]
[tool.poetry]
name = "ep-stats"
version = "2.4.0"
version = "2.5.0"
homepage = "https://github.com/avast/ep-stats"
description = "Statistical package to evaluate ab tests in experimentation platform."
authors = [
Expand Down
8 changes: 8 additions & 0 deletions src/epstats/server/req.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,13 @@ class Experiment(BaseModel):
description="""List of filtering conditions to apply on exposure and goals.""",
)

null_hypothesis_rate: Optional[float] = Field(
None,
title="Null hypothesis rate",
description="""Global null hypothesis rate of the experimentation program. It is defined as the
proportion of all tests in an experimentation program that have not improved or degraded the primary metric.""",
)

query_parameters: dict = Field(
{},
title="Custom query parameters used in the data access.",
Expand Down Expand Up @@ -334,6 +341,7 @@ def to_experiment(self):
unit_type=self.unit_type,
variants=self.variants,
filters=[f.to_filter() for f in self.filters] if self.filters else [],
null_hypothesis_rate=self.null_hypothesis_rate,
query_parameters=self.query_parameters,
)

Expand Down
4 changes: 4 additions & 0 deletions src/epstats/server/res.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ class MetricStat(BaseModel):
title="Power",
description="Test power based on the collected `sample_size`.",
)
false_positive_risk: Optional[float] = Field(
None, title="False positive risk.", description="False positive risk of a statistically significant result."
)

@staticmethod
def from_df(df: pd.DataFrame):
Expand All @@ -88,6 +91,7 @@ def from_df(df: pd.DataFrame):
sample_size=r["sample_size"],
required_sample_size=r["required_sample_size"],
power=r["power"],
false_positive_risk=r["false_positive_risk"],
)
for i, r in df.iterrows()
]
Expand Down
21 changes: 21 additions & 0 deletions src/epstats/toolkit/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def metric_columns(cls) -> List[str]:
1. `sample_size` - current sample size
1. `required_sample_size` - size of the sample required to reach the required power
1. `power` - power based on the collected `sample_size`
1. `false_positive_risk` - false positive risk of a significant metric
"""
return [
"timestamp",
Expand All @@ -76,6 +77,7 @@ def metric_columns(cls) -> List[str]:
"sample_size",
"required_sample_size",
"power",
"false_positive_risk",
]

@classmethod
Expand Down Expand Up @@ -156,6 +158,7 @@ def __init__(
confidence_level: float = DEFAULT_CONFIDENCE_LEVEL,
variants: Optional[List[str]] = None,
filters: Optional[List[Filter]] = None,
null_hypothesis_rate: Optional[float] = None,
query_parameters: dict = {},
):
self._logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
Expand Down Expand Up @@ -189,6 +192,7 @@ def __init__(
self._update_dimension_to_value()
self.filters = filters if filters is not None else []
self.query_parameters = query_parameters
self.null_hypothesis_rate = null_hypothesis_rate

def _check_metric_ids_unique(self):
"""
Expand Down Expand Up @@ -765,6 +769,22 @@ def _get_power_from_required_sample_sizes(self, metrics: pd.DataFrame, n_variant
axis=1,
)

def _get_false_positive_risk(self, metric_row: pd.Series) -> float:
if self.null_hypothesis_rate is None:
return np.nan

if metric_row["p_value"] >= metric_row["confidence_level"]:
return np.nan

return Statistics.false_positive_risk(
null_hypothesis_rate=self.null_hypothesis_rate,
power=metric_row["power"],
p_value=metric_row["p_value"],
)

def _get_false_positive_risks(self, metrics: pd.DataFrame) -> pd.Series:
return metrics.apply(self._get_false_positive_risk, axis=1)

def _evaluate_metrics(self, goals: pd.DataFrame, column_fce) -> pd.DataFrame:
if not self.metrics:
return pd.DataFrame([], columns=Evaluation.metric_columns())
Expand Down Expand Up @@ -822,4 +842,5 @@ def _evaluate_metrics(self, goals: pd.DataFrame, column_fce) -> pd.DataFrame:
c["timestamp"] = round(get_utc_timestamp(datetime.now()).timestamp())
c[["minimum_effect", "sample_size", "required_sample_size"]] = self._get_required_sample_sizes(c, n_variants)
c["power"] = self._get_power_from_required_sample_sizes(c, n_variants)
c["false_positive_risk"] = self._get_false_positive_risks(c)
return c[Evaluation.metric_columns()]
33 changes: 33 additions & 0 deletions src/epstats/toolkit/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,3 +432,36 @@ def power_from_required_sample_size_per_variant(
np.sqrt(required_sample_size_ratio) * (st.norm.ppf(1 - alpha / 2) + st.norm.ppf(required_power))
- st.norm.ppf(1 - alpha / 2)
)

@staticmethod
def false_positive_risk(
    null_hypothesis_rate: float,
    power: float,
    p_value: float,
) -> float:
    """
    Computes false positive risk defined as:

    $$
    P(H_0|S) = \\frac{P(S|H_0)P(H_0)}{P(S)} = \\frac{\\alpha\\pi}{\\alpha\\pi + (1 - \\beta)(1 - \\pi)}
    $$

    where $S$ is a statistically significant outcome, $H_0$ is the null hypothesis, $1 - \\beta$
    is the power of a test, and $\\pi$ is the global null hypothesis rate defined as the proportion
    of all tests in an experimentation program that have not improved or degraded the primary metric.

    Note that this implementation substitutes the observed `p_value` for $\\alpha$ in the
    formula above.

    False positive risk $P(H_0|S)$ is not the same as the false positive rate $P(S|H_0) = \\alpha$.

    More information can be found in the paper: https://bit.ly/ABTestingIntuitionBusters.

    Arguments:
        null_hypothesis_rate: global null hypothesis rate of the experimentation program
        power: power achieved in the test
        p_value: p-value of the test

    Returns:
        false positive risk
    """

    pi = null_hypothesis_rate
    return (p_value * pi) / (p_value * pi + power * (1 - pi))
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ test-multi-check c test_unit_type global exposure 5200
test-conversion-with-minimum-effect a test_unit_type global exposure 21
test-conversion-with-minimum-effect b test_unit_type global exposure 26
test-conversion-with-minimum-effect c test_unit_type global exposure 30
test-false-positive-risk a test_unit_type global exposure 1000
test-false-positive-risk b test_unit_type global exposure 1001
test-dim-operators a test_unit_type global exposure 1000
test-dim-operators b test_unit_type global exposure 1001
test-operator-precedence a test_unit_type global exposure 80
Expand Down
Loading
Loading