|
40 | 40 | },
41 | 41 | "outputs": [],
42 | 42 | "source": [
43 |    | - "# S3 prefix\n",
44 |    | - "bucket = '< ENTER BUCKET NAME HERE >'\n",
45 |    | - "prefix = 'Scikit-LinearLearner-pipeline-abalone-example'\n",
46 |    | - "\n",
47 | 43 | "import sagemaker\n",
48 | 44 | "from sagemaker import get_execution_role\n",
49 | 45 | "\n",
50 | 46 | "sagemaker_session = sagemaker.Session()\n",
51 | 47 | "\n",
52 | 48 | "# Get a SageMaker-compatible role used by this Notebook Instance.\n",
53 |    | - "role = get_execution_role()"
   | 49 | + "role = get_execution_role()\n",
   | 50 | + "\n",
   | 51 | + "# S3 prefix\n",
   | 52 | + "bucket = sagemaker_session.default_bucket()\n",
   | 53 | + "prefix = 'Scikit-LinearLearner-pipeline-abalone-example'"
54 | 54 | ]
55 | 55 | },
56 | 56 | {
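Net effect of the hunk above: the `< ENTER BUCKET NAME HERE >` placeholder is gone and the bucket comes from the session, so the setup cell runs without manual edits. The resulting cell, reassembled from the new lines (the comment wording is the notebook's own; the note on `default_bucket()` is ours):

    import sagemaker
    from sagemaker import get_execution_role

    sagemaker_session = sagemaker.Session()

    # Get a SageMaker-compatible role used by this Notebook Instance.
    role = get_execution_role()

    # S3 prefix. default_bucket() resolves to sagemaker-<region>-<account-id>
    # and creates that bucket on first use, so no hand-entered name is needed.
    bucket = sagemaker_session.default_bucket()
    prefix = 'Scikit-LinearLearner-pipeline-abalone-example'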
|
|
123 | 123 | "import argparse\n",
|
124 | 124 | "import csv\n",
|
125 | 125 | "import json\n",
|
| 126 | + "import joblib\n", |
126 | 127 | "import numpy as np\n",
|
127 | 128 | "import pandas as pd\n",
|
128 | 129 | "\n",
|
129 |
| - "from sklearn.compose import ColumnTransformer\n", |
130 |
| - "from sklearn.externals import joblib\n", |
| 130 | + "from sklearn.compose import ColumnTransformer, make_column_selector\n", |
131 | 131 | "from sklearn.impute import SimpleImputer\n",
|
132 |
| - "from sklearn.pipeline import Pipeline\n", |
| 132 | + "from sklearn.pipeline import make_pipeline\n", |
133 | 133 | "from sklearn.preprocessing import Binarizer, StandardScaler, OneHotEncoder\n",
|
134 | 134 | "\n",
|
135 | 135 | "from sagemaker_containers.beta.framework import (\n",
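The import swap tracks scikit-learn itself: `sklearn.externals.joblib` was deprecated in 0.21 and removed in 0.23, so the script now imports the standalone `joblib` package. A minimal sketch of the persistence pattern this import supports (the estimator and file name here are illustrative, not the featurizer's exact save path):

    import joblib
    from sklearn.preprocessing import StandardScaler

    # Fit any estimator, then persist and reload it with standalone joblib.
    scaler = StandardScaler().fit([[0.0], [1.0], [2.0]])
    joblib.dump(scaler, "model.joblib")
    restored = joblib.load("model.joblib")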
|
|
149 | 149 | "label_column = 'rings'\n",
150 | 150 | "\n",
151 | 151 | "feature_columns_dtype = {\n",
152 |     | - " 'sex': str,\n",
153 |     | - " 'length': np.float64,\n",
154 |     | - " 'diameter': np.float64,\n",
155 |     | - " 'height': np.float64,\n",
156 |     | - " 'whole_weight': np.float64,\n",
157 |     | - " 'shucked_weight': np.float64,\n",
158 |     | - " 'viscera_weight': np.float64,\n",
159 |     | - " 'shell_weight': np.float64}\n",
    | 152 | + " 'sex': \"category\",\n",
    | 153 | + " 'length': \"float64\",\n",
    | 154 | + " 'diameter': \"float64\",\n",
    | 155 | + " 'height': \"float64\",\n",
    | 156 | + " 'whole_weight': \"float64\",\n",
    | 157 | + " 'shucked_weight': \"float64\",\n",
    | 158 | + " 'viscera_weight': \"float64\",\n",
    | 159 | + " 'shell_weight': \"float64\"}\n",
160 | 160 | "\n",
161 |     | - "label_column_dtype = {'rings': np.float64} # +1.5 gives the age in years\n",
    | 161 | + "label_column_dtype = {'rings': \"float64\"} # +1.5 gives the age in years\n",
162 | 162 | "\n",
163 | 163 | "def merge_two_dicts(x, y):\n",
164 | 164 | " z = x.copy() # start with x's keys and values\n",
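The dtype maps now use pandas dtype strings instead of Python/NumPy types, and `'sex'` is read as `"category"`. That tagging is what lets the `ColumnTransformer` further down select columns by dtype rather than by hard-coded name lists. A small illustration with hypothetical values:

    import pandas as pd

    # "float64" is equivalent to np.float64; "category" tags 'sex' with a
    # dtype that make_column_selector can later match on.
    df = pd.DataFrame({"sex": ["M", "F", "I"], "length": [0.455, 0.35, 0.53]})
    df = df.astype({"sex": "category", "length": "float64"})
    df.dtypes  # sex: category, length: float64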
|
|
190 | 190 | " names=feature_columns_names + [label_column],\n",
191 | 191 | " dtype=merge_two_dicts(feature_columns_dtype, label_column_dtype)) for file in input_files ]\n",
192 | 192 | " concat_data = pd.concat(raw_data)\n",
193 |     | - " \n",
    | 193 | + "\n",
    | 194 | + " # Labels should not be preprocessed. predict_fn will reinsert the labels after featurizing.\n",
    | 195 | + " concat_data.drop(label_column, axis=1, inplace=True)\n",
    | 196 | + "\n",
194 | 197 | " # This section is adapted from the scikit-learn example of using preprocessing pipelines:\n",
195 | 198 | " #\n",
196 | 199 | " # https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html\n",
|
|
206 | 209 | " # - shell_weight: Weight after being dried\n",
207 | 210 | " # Categorical Features:\n",
208 | 211 | " # - sex: categories encoded as strings {'M', 'F', 'I'} where 'I' is Infant\n",
209 |     | - " numeric_features = list(feature_columns_names)\n",
210 |     | - " numeric_features.remove('sex')\n",
211 |     | - " numeric_transformer = Pipeline(steps=[\n",
212 |     | - " ('imputer', SimpleImputer(strategy='median')),\n",
213 |     | - " ('scaler', StandardScaler())])\n",
214 |     | - "\n",
215 |     | - " categorical_features = ['sex']\n",
216 |     | - " categorical_transformer = Pipeline(steps=[\n",
217 |     | - " ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),\n",
218 |     | - " ('onehot', OneHotEncoder(handle_unknown='ignore'))])\n",
219 |     | - "\n",
220 |     | - " preprocessor = ColumnTransformer(\n",
221 |     | - " transformers=[\n",
222 |     | - " ('num', numeric_transformer, numeric_features),\n",
223 |     | - " ('cat', categorical_transformer, categorical_features)],\n",
224 |     | - " remainder=\"drop\")\n",
    | 212 | + " numeric_transformer = make_pipeline(\n",
    | 213 | + " SimpleImputer(strategy='median'),\n",
    | 214 | + " StandardScaler())\n",
    | 215 | + "\n",
    | 216 | + " categorical_transformer = make_pipeline(\n",
    | 217 | + " SimpleImputer(strategy='constant', fill_value='missing'),\n",
    | 218 | + " OneHotEncoder(handle_unknown='ignore'))\n",
    | 219 | + "\n",
    | 220 | + " preprocessor = ColumnTransformer(transformers=[\n",
    | 221 | + " (\"num\", numeric_transformer, make_column_selector(dtype_exclude=\"category\")),\n",
    | 222 | + " (\"cat\", categorical_transformer, make_column_selector(dtype_include=\"category\"))])\n",
225 | 223 | " \n",
226 | 224 | " preprocessor.fit(concat_data)\n",
227 | 225 | "\n",
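The rewritten preprocessing block replaces the hand-maintained `numeric_features` / `categorical_features` name lists with `make_column_selector`, which resolves columns at fit time from their dtypes. A minimal sketch of how the selectors behave, on a toy frame rather than the abalone data:

    import pandas as pd
    from sklearn.compose import make_column_selector

    df = pd.DataFrame({"sex": pd.Series(["M", "I"], dtype="category"),
                       "length": [0.455, 0.35]})

    # A selector is a callable: given a DataFrame, it returns the matching
    # column names, so the pipeline needs no hard-coded feature lists.
    make_column_selector(dtype_include="category")(df)  # -> ['sex']
    make_column_selector(dtype_exclude="category")(df)  # -> ['length']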
|
|
268 | 266 | "\n",
269 | 267 | " json_output = {\"instances\": instances}\n",
270 | 268 | "\n",
271 |     | - " return worker.Response(json.dumps(json_output), accept, mimetype=accept)\n",
    | 269 | + " return worker.Response(json.dumps(json_output), mimetype=accept)\n",
272 | 270 | " elif accept == 'text/csv':\n",
273 |     | - " return worker.Response(encoders.encode(prediction, accept), accept, mimetype=accept)\n",
    | 271 | + " return worker.Response(encoders.encode(prediction, accept), mimetype=accept)\n",
274 | 272 | " else:\n",
275 | 273 | " raise RuntimeException(\"{} accept type is not supported by this script.\".format(accept))\n",
276 | 274 | "\n",
|
|
286 | 284 | " rest of features either one hot encoded or standardized\n",
287 | 285 | " \"\"\"\n",
288 | 286 | " features = model.transform(input_data)\n",
289 |     | - " \n",
    | 287 | + "\n",
290 | 288 | " if label_column in input_data:\n",
291 | 289 | " # Return the label (as the first column) and the set of features.\n",
292 | 290 | " return np.insert(features, 0, input_data[label_column], axis=1)\n",
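Together with the `concat_data.drop(label_column, ...)` added in `fit` above, this completes the label round trip: the featurizer is fit on features only, and when a labeled frame passes through at transform time the label is reinserted as column 0, giving the downstream LinearLearner target-first CSV. A toy illustration of the `np.insert` call, with made-up numbers:

    import numpy as np

    features = np.array([[0.1, 0.2], [0.3, 0.4]])  # featurized rows
    rings = np.array([15.0, 7.0])                  # label values
    np.insert(features, 0, rings, axis=1)
    # array([[15. ,  0.1,  0.2],
    #        [ 7. ,  0.3,  0.4]])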
|
|
313 | 311 | "\n",
314 | 312 | "* __entry_point__: The path to the Python script SageMaker runs for training and prediction.\n",
315 | 313 | "* __role__: Role ARN\n",
    | 314 | + "* __framework_version__: Scikit-learn version you want to use for executing your model training code.\n",
316 | 315 | "* __train_instance_type__ *(optional)*: The type of SageMaker instances for training. __Note__: Because Scikit-learn does not natively support GPU training, Sagemaker Scikit-learn does not currently support training on GPU instance types.\n",
317 | 316 | "* __sagemaker_session__ *(optional)*: The session used to train on Sagemaker.\n",
318 | 317 | "\n",
|
|
327 | 326 | "source": [
328 | 327 | "from sagemaker.sklearn.estimator import SKLearn\n",
329 | 328 | "\n",
    | 329 | + "FRAMEWORK_VERSION = \"0.23-1\"\n",
330 | 330 | "script_path = 'sklearn_abalone_featurizer.py'\n",
331 | 331 | "\n",
332 | 332 | "sklearn_preprocessor = SKLearn(\n",
333 | 333 | " entry_point=script_path,\n",
334 | 334 | " role=role,\n",
    | 335 | + " framework_version=FRAMEWORK_VERSION,\n",
335 | 336 | " train_instance_type=\"ml.c4.xlarge\",\n",
336 | 337 | " sagemaker_session=sagemaker_session)\n"
337 | 338 | ]
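Pinning `framework_version="0.23-1"` selects the prebuilt scikit-learn 0.23 container and matches the `joblib` / `make_column_selector` changes in the featurizer script; the SageMaker Python SDK now asks for this version explicitly rather than falling back to a default. A sketch of launching the featurizer job as a continuation of the cell above (assumes `train_input` is the S3 URI of the uploaded abalone CSV, set earlier in the notebook):

    # Fit the preprocessor as a SageMaker training job; 'train' is the
    # channel name the entry-point script reads its input files from.
    sklearn_preprocessor.fit({"train": train_input})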
|
|
362 | 363 | "# Define a SKLearn Transformer from the trained SKLearn Estimator\n",
363 | 364 | "transformer = sklearn_preprocessor.transformer(\n",
364 | 365 | " instance_count=1, \n",
365 |     | - " instance_type='ml.m4.xlarge',\n",
    | 366 | + " instance_type='ml.m5.xlarge',\n",
366 | 367 | " assemble_with = 'Line',\n",
367 | 368 | " accept = 'text/csv')"
|
368 | 369 | ]
|
|
374 | 375 | "outputs": [],
375 | 376 | "source": [
376 | 377 | "# Preprocess training input\n",
377 |     | - "transformer.transform(train_input, content_type='text/csv')\n",
378 |     | - "print('Waiting for transform job: ' + transformer.latest_transform_job.job_name)\n",
    | 378 | + "transformer.transform(train_input, content_type=\"text/csv\")\n",
    | 379 | + "print(\"Waiting for transform job: \" + transformer.latest_transform_job.job_name)\n",
379 | 380 | "transformer.wait()\n",
380 | 381 | "preprocessed_train = transformer.output_path"
|
381 | 382 | ]
|
|
524 | 525 | "sm_client = sagemaker_session.boto_session.client('sagemaker')\n",
525 | 526 | "sm_client.delete_endpoint(EndpointName=endpoint_name)"
526 | 527 | ]
527 |     | - },
528 |     | - {
529 |     | - "cell_type": "code",
530 |     | - "execution_count": null,
531 |     | - "metadata": {},
532 |     | - "outputs": [],
533 |     | - "source": []
534 | 528 | }
535 | 529 | ],
536 | 530 | "metadata": {
|
|