Changed the default of `random_state` to None in `fair_stratified_train_test_split`. Renamed the `preprocessing` argument to `preparation` in the fairness code to avoid a collision with AIF360 terminology. (#615)

hirzel · web-flow · commit 1f0339748e90 · 2021-02-26T16:11:50.000-05:00
diff --git a/docs/img/fairness_patterns.png b/docs/img/fairness_patterns.png
diff --git a/examples/demo_aif360.ipynb b/examples/demo_aif360.ipynb
@@ -593,7 +593,7 @@
    "source": [
     "from lale.lib.aif360 import fair_stratified_train_test_split\n",
     "train_X, test_X, train_y, test_y = fair_stratified_train_test_split(\n",
-    "    all_X, all_y, **fairness_info, test_size=0.33)"
+    "    all_X, all_y, **fairness_info, test_size=0.33, random_state=42)"
    ]
   },
   {
@@ -1093,7 +1093,7 @@
     "In the visualization, light blue indicates trainable operators\n",
     "and dark blue indicates that automation must make a choice before\n",
     "the operators can be trained. Compared to the earlier pipeline,\n",
-    "we pass the preprocessing as an argument to `DisparateImpactRemover`,\n",
+    "we pass the data preparation sub-pipeline as an argument to `DisparateImpactRemover`,\n",
     "since that fairness mitigator needs numerical data to work on."
    ]
   },
@@ -1120,7 +1120,7 @@
        "</a>\n",
        "</g>\n",
        "<g id=\"clust1\" class=\"cluster\"><title>cluster:disparate_impact_remover</title>\n",
-       "<g id=\"a_clust1\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.aif360.disparate_impact_remover.html\" xlink:title=\"disparate_impact_remover = DisparateImpactRemover(favorable_labels=[&#39;good&#39;], protected_attributes=[{&#39;feature&#39;: &#39;personal_status&#39;, &#39;privileged_groups&#39;: [&#39;male div/sep&#39;, &#39;male mar/wid&#39;, &#39;male single&#39;]}, {&#39;feature&#39;: &#39;age&#39;, &#39;privileged_groups&#39;: [[26, 1000]]}], preprocessing=pipeline_0)\">\n",
+       "<g id=\"a_clust1\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.aif360.disparate_impact_remover.html\" xlink:title=\"disparate_impact_remover = DisparateImpactRemover(favorable_labels=[&#39;good&#39;], protected_attributes=[{&#39;feature&#39;: &#39;personal_status&#39;, &#39;privileged_groups&#39;: [&#39;male div/sep&#39;, &#39;male mar/wid&#39;, &#39;male single&#39;]}, {&#39;feature&#39;: &#39;age&#39;, &#39;privileged_groups&#39;: [[26, 1000]]}], preparation=pipeline_0)\">\n",
        "<polygon fill=\"#b0e2ff\" stroke=\"black\" points=\"8,-59 8,-213 296.108,-213 296.108,-59 8,-59\"/>\n",
        "<text text-anchor=\"middle\" x=\"152.054\" y=\"-197.8\" font-family=\"Times,serif\" font-size=\"14.00\">DisparateImpactRemover</text>\n",
        "</a>\n",
@@ -1231,7 +1231,7 @@
    ],
    "source": [
     "di_remover = DisparateImpactRemover(\n",
-    "    **fairness_info, preprocessing=prep_to_numbers)\n",
+    "    **fairness_info, preparation=prep_to_numbers)\n",
     "planned_fairer = di_remover >> (LR | Tree | KNN)\n",
     "planned_fairer.visualize()"
    ]
@@ -1347,7 +1347,7 @@
        "</a>\n",
        "</g>\n",
        "<g id=\"clust1\" class=\"cluster\"><title>cluster:disparate_impact_remover</title>\n",
-       "<g id=\"a_clust1\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.aif360.disparate_impact_remover.html\" xlink:title=\"disparate_impact_remover = DisparateImpactRemover(favorable_labels=[&#39;good&#39;], protected_attributes=[{&#39;feature&#39;: &#39;personal_status&#39;, &#39;privileged_groups&#39;: [&#39;male div/sep&#39;, &#39;male mar/wid&#39;, &#39;male single&#39;]}, {&#39;feature&#39;: &#39;age&#39;, &#39;privileged_groups&#39;: [[26, 1000]]}], preprocessing=pipeline_0, repair_level=0.8641...)\">\n",
+       "<g id=\"a_clust1\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.aif360.disparate_impact_remover.html\" xlink:title=\"disparate_impact_remover = DisparateImpactRemover(favorable_labels=[&#39;good&#39;], protected_attributes=[{&#39;feature&#39;: &#39;personal_status&#39;, &#39;privileged_groups&#39;: [&#39;male div/sep&#39;, &#39;male mar/wid&#39;, &#39;male single&#39;]}, {&#39;feature&#39;: &#39;age&#39;, &#39;privileged_groups&#39;: [[26, 1000]]}], preparation=pipeline_0, repair_level=0.8641...)\">\n",
        "<polygon fill=\"white\" stroke=\"black\" points=\"8,-8 8,-162 296.108,-162 296.108,-8 8,-8\"/>\n",
        "<text text-anchor=\"middle\" x=\"152.054\" y=\"-146.8\" font-family=\"Times,serif\" font-size=\"14.00\">DisparateImpactRemover</text>\n",
        "</a>\n",
@@ -1472,7 +1472,7 @@
        "        },\n",
        "        {\"feature\": \"age\", \"privileged_groups\": [[26, 1000]]},\n",
        "    ],\n",
-       "    preprocessing=((project >> one_hot_encoder) & project_0)\n",
+       "    preparation=((project >> one_hot_encoder) & project_0)\n",
        "    >> ConcatFeatures(),\n",
        "    repair_level=0.8641279154649505,\n",
        ")\n",
diff --git a/lale/lib/aif360/__init__.py b/lale/lib/aif360/__init__.py
@@ -105,21 +105,21 @@
 
     pipeline = LFR(
         **fairness_info,
-        preprocessing=(
+        preparation=(
             (Project(columns={"type": "string"}) >> OneHotEncoder(handle_unknown="ignore"))
             & Project(columns={"type": "number"})
         )
         >> ConcatFeatures
     ) >> LogisticRegression(max_iter=1000)
 
 In this example, the *mitigator* is LFR (which is pre-estimator), the
-*estimator* is LogisticRegression, and the *preprocessing* is a
+*estimator* is LogisticRegression, and the *preparation* is a
 sub-pipeline that one-hot-encodes strings. If all features of the data
-are numerical, then the preprocessing can be omitted. Internally, the
+are numerical, then the preparation can be omitted. Internally, the
 LFR higher-order operator uses two auxiliary operators, Redacting
 and ProtectedAttributesEncoder.  Redacting sets protected attributes
 to a constant to prevent them from directly influencing
-fairness-agnostic preprocessing or estimators. And the
+fairness-agnostic data preparation or estimators. And the
 ProtectedAttributesEncoder encodes protected attributes and labels as
 zero or one to simplify the task for the mitigator.
 
diff --git a/lale/lib/aif360/adversarial_debiasing.py b/lale/lib/aif360/adversarial_debiasing.py
@@ -39,7 +39,7 @@ def __init__(
         favorable_labels,
         protected_attributes,
         redact=True,
-        preprocessing=None,
+        preparation=None,
         scope_name="adversarial_debiasing",
         sess=None,
         seed=None,
@@ -75,7 +75,7 @@ def __init__(
             favorable_labels=favorable_labels,
             protected_attributes=protected_attributes,
             redact=redact,
-            preprocessing=preprocessing,
+            preparation=preparation,
             mitigator=mitigator,
         )
 
@@ -94,7 +94,7 @@ def __init__(
             "required": [
                 *_categorical_fairness_properties.keys(),
                 "redact",
-                "preprocessing",
+                "preparation",
                 "scope_name",
                 "sess",
                 "seed",
@@ -113,11 +113,11 @@ def __init__(
             "properties": {
                 **_categorical_fairness_properties,
                 "redact": {
-                    "description": "Whether to redact protected attributes before preprocessing (recommended) or not.",
+                    "description": "Whether to redact protected attributes before data preparation (recommended) or not.",
                     "type": "boolean",
                     "default": True,
                 },
-                "preprocessing": {
+                "preparation": {
                     "description": "Transformer, which may be an individual operator or a sub-pipeline.",
                     "anyOf": [
                         {"laleType": "operator"},
diff --git a/lale/lib/aif360/calibrated_eq_odds_postprocessing.py b/lale/lib/aif360/calibrated_eq_odds_postprocessing.py
@@ -81,7 +81,7 @@ def __init__(
                     "laleType": "operator",
                 },
                 "redact": {
-                    "description": "Whether to redact protected attributes before preprocessing (recommended) or not.",
+                    "description": "Whether to redact protected attributes before data preparation (recommended) or not.",
                     "type": "boolean",
                     "default": True,
                 },
diff --git a/lale/lib/aif360/disparate_impact_remover.py b/lale/lib/aif360/disparate_impact_remover.py
@@ -37,15 +37,15 @@ def __init__(
         favorable_labels,
         protected_attributes,
         redact=True,
-        preprocessing=None,
+        preparation=None,
         repair_level=1.0,
     ):
         self.favorable_labels = favorable_labels
         self.protected_attributes = protected_attributes
         self.redact = redact
-        if preprocessing is None:
-            preprocessing = lale.lib.lale.NoOp
-        self.preprocessing = preprocessing
+        if preparation is None:
+            preparation = lale.lib.lale.NoOp
+        self.preparation = preparation
         self.repair_level = repair_level
 
     def _prep_and_encode(self, X, y=None):
@@ -79,8 +79,8 @@ def fit(self, X, y=None):
             "protected_attributes": self.protected_attributes,
         }
         redacting = Redacting(**fairness_info) if self.redact else lale.lib.lale.NoOp
-        preprocessing = self.preprocessing
-        trainable_redact_and_prep = redacting >> preprocessing
+        preparation = self.preparation
+        trainable_redact_and_prep = redacting >> preparation
         assert isinstance(trainable_redact_and_prep, lale.operators.TrainablePipeline)
         self.redact_and_prep = trainable_redact_and_prep.fit(X, y)
         self.prot_attr_enc = ProtectedAttributesEncoder(
@@ -130,18 +130,18 @@ def transform(self, X):
             "required": [
                 *_categorical_fairness_properties.keys(),
                 "redact",
-                "preprocessing",
+                "preparation",
                 "repair_level",
             ],
             "relevantToOptimizer": ["repair_level"],
             "properties": {
                 **_categorical_fairness_properties,
                 "redact": {
-                    "description": "Whether to redact protected attributes before preprocessing (recommended) or not.",
+                    "description": "Whether to redact protected attributes before data preparation (recommended) or not.",
                     "type": "boolean",
                     "default": True,
                 },
-                "preprocessing": {
+                "preparation": {
                     "description": "Transformer, which may be an individual operator or a sub-pipeline.",
                     "anyOf": [
                         {"laleType": "operator"},
diff --git a/lale/lib/aif360/eq_odds_postprocessing.py b/lale/lib/aif360/eq_odds_postprocessing.py
@@ -73,7 +73,7 @@ def __init__(
                     "laleType": "operator",
                 },
                 "redact": {
-                    "description": "Whether to redact protected attributes before preprocessing (recommended) or not.",
+                    "description": "Whether to redact protected attributes before data preparation (recommended) or not.",
                     "type": "boolean",
                     "default": True,
                 },
diff --git a/lale/lib/aif360/gerry_fair_classifier.py b/lale/lib/aif360/gerry_fair_classifier.py
@@ -33,7 +33,7 @@ def __init__(
         favorable_labels,
         protected_attributes,
         redact=True,
-        preprocessing=None,
+        preparation=None,
         C=10,
         printflag=False,
         heatmapflag=False,
@@ -68,7 +68,7 @@ def __init__(
             favorable_labels=favorable_labels,
             protected_attributes=protected_attributes,
             redact=redact,
-            preprocessing=preprocessing,
+            preparation=preparation,
             mitigator=mitigator,
         )
 
@@ -87,7 +87,7 @@ def __init__(
             "required": [
                 *_categorical_fairness_properties.keys(),
                 "redact",
-                "preprocessing",
+                "preparation",
                 "C",
                 "printflag",
                 "heatmapflag",
@@ -102,11 +102,11 @@ def __init__(
             "properties": {
                 **_categorical_fairness_properties,
                 "redact": {
-                    "description": "Whether to redact protected attributes before preprocessing (recommended) or not.",
+                    "description": "Whether to redact protected attributes before data preparation (recommended) or not.",
                     "type": "boolean",
                     "default": True,
                 },
-                "preprocessing": {
+                "preparation": {
                     "description": "Transformer, which may be an individual operator or a sub-pipeline.",
                     "anyOf": [
                         {"laleType": "operator"},
diff --git a/lale/lib/aif360/lfr.py b/lale/lib/aif360/lfr.py
@@ -38,7 +38,7 @@ def __init__(
         favorable_labels,
         protected_attributes,
         redact=True,
-        preprocessing=None,
+        preparation=None,
         k=5,
         Ax=0.01,
         Az=1.0,
@@ -50,9 +50,9 @@ def __init__(
         self.favorable_labels = favorable_labels
         self.protected_attributes = protected_attributes
         self.redact = redact
-        if preprocessing is None:
-            preprocessing = lale.lib.lale.NoOp
-        self.preprocessing = preprocessing
+        if preparation is None:
+            preparation = lale.lib.lale.NoOp
+        self.preparation = preparation
         prot_attr_names = [pa["feature"] for pa in protected_attributes]
         unprivileged_groups = [{name: 0 for name in prot_attr_names}]
         privileged_groups = [{name: 1 for name in prot_attr_names}]
@@ -93,8 +93,8 @@ def fit(self, X, y):
             "protected_attributes": self.protected_attributes,
         }
         redacting = Redacting(**fairness_info) if self.redact else lale.lib.lale.NoOp
-        preprocessing = self.preprocessing
-        trainable_redact1_and_prep = redacting >> preprocessing
+        preparation = self.preparation
+        trainable_redact1_and_prep = redacting >> preparation
         assert isinstance(trainable_redact1_and_prep, lale.operators.TrainablePipeline)
         self.redact1_and_prep = trainable_redact1_and_prep.fit(X, y)
         self.prot_attr_enc = ProtectedAttributesEncoder(
@@ -132,7 +132,7 @@ def transform(self, X):
             "required": [
                 *_categorical_fairness_properties.keys(),
                 "redact",
-                "preprocessing",
+                "preparation",
                 "k",
                 "Ax",
                 "Az",
@@ -145,11 +145,11 @@ def transform(self, X):
             "properties": {
                 **_categorical_fairness_properties,
                 "redact": {
-                    "description": "Whether to redact protected attributes before preprocessing (recommended) or not.",
+                    "description": "Whether to redact protected attributes before data preparation (recommended) or not.",
                     "type": "boolean",
                     "default": True,
                 },
-                "preprocessing": {
+                "preparation": {
                     "description": "Transformer, which may be an individual operator or a sub-pipeline.",
                     "anyOf": [
                         {"laleType": "operator"},
diff --git a/lale/lib/aif360/meta_fair_classifier.py b/lale/lib/aif360/meta_fair_classifier.py
@@ -26,7 +26,7 @@ def __init__(
         favorable_labels,
         protected_attributes,
         redact=True,
-        preprocessing=None,
+        preparation=None,
         tau=0.8,
         type="fdr",
     ):
@@ -38,7 +38,7 @@ def __init__(
             favorable_labels=favorable_labels,
             protected_attributes=protected_attributes,
             redact=redact,
-            preprocessing=preprocessing,
+            preparation=preparation,
             mitigator=mitigator,
         )
 
@@ -100,19 +100,19 @@ def __init__(
             "required": [
                 *_categorical_fairness_properties.keys(),
                 "redact",
-                "preprocessing",
+                "preparation",
                 "tau",
                 "type",
             ],
             "relevantToOptimizer": ["tau", "type"],
             "properties": {
                 **_categorical_fairness_properties,
                 "redact": {
-                    "description": "Whether to redact protected attributes before preprocessing (recommended) or not.",
+                    "description": "Whether to redact protected attributes before data preparation (recommended) or not.",
                     "type": "boolean",
                     "default": True,
                 },
-                "preprocessing": {
+                "preparation": {
                     "description": "Transformer, which may be an individual operator or a sub-pipeline.",
                     "anyOf": [
                         {"laleType": "operator"},
diff --git a/lale/lib/aif360/prejudice_remover.py b/lale/lib/aif360/prejudice_remover.py
@@ -32,15 +32,15 @@ def __init__(
         favorable_labels,
         protected_attributes,
         redact=True,
-        preprocessing=None,
+        preparation=None,
         eta=1.0,
     ):
         mitigator = aif360.algorithms.inprocessing.PrejudiceRemover(eta=eta)
         super(PrejudiceRemoverImpl, self).__init__(
             favorable_labels=favorable_labels,
             protected_attributes=protected_attributes,
             redact=redact,
-            preprocessing=preprocessing,
+            preparation=preparation,
             mitigator=mitigator,
         )
 
@@ -59,18 +59,18 @@ def __init__(
             "required": [
                 *_categorical_fairness_properties.keys(),
                 "redact",
-                "preprocessing",
+                "preparation",
                 "eta",
             ],
             "relevantToOptimizer": ["eta"],
             "properties": {
                 **_categorical_fairness_properties,
                 "redact": {
-                    "description": "Whether to redact protected attributes before preprocessing (recommended) or not.",
+                    "description": "Whether to redact protected attributes before data preparation (recommended) or not.",
                     "type": "boolean",
                     "default": True,
                 },
-                "preprocessing": {
+                "preparation": {
                     "description": "Transformer, which may be an individual operator or a sub-pipeline.",
                     "anyOf": [
                         {"laleType": "operator"},
diff --git a/lale/lib/aif360/protected_attributes_encoder.py b/lale/lib/aif360/protected_attributes_encoder.py
@@ -167,7 +167,7 @@
 protected attributes suitable as input for downstream fairness
 mitigation operators. This operator does not encode the remaining
 (non-protected) attributes. A common usage is to encode non-protected
-attributes with a separate preprocessing pipeline and to perform a
+attributes with a separate data preparation pipeline and to perform a
 feature union before piping the transformed data to downstream
 operators that require numeric data.
 """,
diff --git a/lale/lib/aif360/reject_option_classification.py b/lale/lib/aif360/reject_option_classification.py
@@ -96,7 +96,7 @@ def __init__(
                     "laleType": "operator",
                 },
                 "redact": {
-                    "description": "Whether to redact protected attributes before preprocessing (recommended) or not.",
+                    "description": "Whether to redact protected attributes before data preparation (recommended) or not.",
                     "type": "boolean",
                     "default": True,
                 },
diff --git a/lale/lib/aif360/reweighing.py b/lale/lib/aif360/reweighing.py
@@ -110,7 +110,7 @@ def predict(self, X):
                     "laleType": "operator",
                 },
                 "redact": {
-                    "description": "Whether to redact protected attributes before preprocessing (recommended) or not.",
+                    "description": "Whether to redact protected attributes before data preparation (recommended) or not.",
                     "type": "boolean",
                     "default": True,
                 },
diff --git a/lale/lib/aif360/util.py b/lale/lib/aif360/util.py
diff --git a/talks/fairness_patterns.pptx b/talks/fairness_patterns.pptx
diff --git a/test/test_aif360.py b/test/test_aif360.py