
Commit c449122

Author: Edward J Kim
Update sklearn notebooks to use 0.23-1 (aws#1329)
1 parent cafaf99 commit c449122


5 files changed: +93 −100 lines


Diff for: sagemaker-python-sdk/scikit_learn_inference_pipeline/Inference Pipeline with Scikit-learn and Linear Learner.ipynb

+41 −47
@@ -40,17 +40,17 @@
 },
 "outputs": [],
 "source": [
-"# S3 prefix\n",
-"bucket = '< ENTER BUCKET NAME HERE >'\n",
-"prefix = 'Scikit-LinearLearner-pipeline-abalone-example'\n",
-"\n",
 "import sagemaker\n",
 "from sagemaker import get_execution_role\n",
 "\n",
 "sagemaker_session = sagemaker.Session()\n",
 "\n",
 "# Get a SageMaker-compatible role used by this Notebook Instance.\n",
-"role = get_execution_role()"
+"role = get_execution_role()\n",
+"\n",
+"# S3 prefix\n",
+"bucket = sagemaker_session.default_bucket()\n",
+"prefix = 'Scikit-LinearLearner-pipeline-abalone-example'"
 ]
 },
 {
@@ -123,13 +123,13 @@
 "import argparse\n",
 "import csv\n",
 "import json\n",
+"import joblib\n",
 "import numpy as np\n",
 "import pandas as pd\n",
 "\n",
-"from sklearn.compose import ColumnTransformer\n",
-"from sklearn.externals import joblib\n",
+"from sklearn.compose import ColumnTransformer, make_column_selector\n",
 "from sklearn.impute import SimpleImputer\n",
-"from sklearn.pipeline import Pipeline\n",
+"from sklearn.pipeline import make_pipeline\n",
 "from sklearn.preprocessing import Binarizer, StandardScaler, OneHotEncoder\n",
 "\n",
 "from sagemaker_containers.beta.framework import (\n",
@@ -149,16 +149,16 @@
 "label_column = 'rings'\n",
 "\n",
 "feature_columns_dtype = {\n",
-"    'sex': str,\n",
-"    'length': np.float64,\n",
-"    'diameter': np.float64,\n",
-"    'height': np.float64,\n",
-"    'whole_weight': np.float64,\n",
-"    'shucked_weight': np.float64,\n",
-"    'viscera_weight': np.float64,\n",
-"    'shell_weight': np.float64}\n",
+"    'sex': \"category\",\n",
+"    'length': \"float64\",\n",
+"    'diameter': \"float64\",\n",
+"    'height': \"float64\",\n",
+"    'whole_weight': \"float64\",\n",
+"    'shucked_weight': \"float64\",\n",
+"    'viscera_weight': \"float64\",\n",
+"    'shell_weight': \"float64\"}\n",
 "\n",
-"label_column_dtype = {'rings': np.float64} # +1.5 gives the age in years\n",
+"label_column_dtype = {'rings': \"float64\"} # +1.5 gives the age in years\n",
 "\n",
 "def merge_two_dicts(x, y):\n",
 "    z = x.copy() # start with x's keys and values\n",
@@ -190,7 +190,10 @@
 "        names=feature_columns_names + [label_column],\n",
 "        dtype=merge_two_dicts(feature_columns_dtype, label_column_dtype)) for file in input_files ]\n",
 "    concat_data = pd.concat(raw_data)\n",
-"    \n",
+"\n",
+"    # Labels should not be preprocessed. predict_fn will reinsert the labels after featurizing.\n",
+"    concat_data.drop(label_column, axis=1, inplace=True)\n",
+"\n",
 "    # This section is adapted from the scikit-learn example of using preprocessing pipelines:\n",
 "    #\n",
 "    # https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html\n",
@@ -206,22 +209,17 @@
 "    # - shell_weight: Weight after being dried\n",
 "    # Categorical Features:\n",
 "    # - sex: categories encoded as strings {'M', 'F', 'I'} where 'I' is Infant\n",
-"    numeric_features = list(feature_columns_names)\n",
-"    numeric_features.remove('sex')\n",
-"    numeric_transformer = Pipeline(steps=[\n",
-"        ('imputer', SimpleImputer(strategy='median')),\n",
-"        ('scaler', StandardScaler())])\n",
-"\n",
-"    categorical_features = ['sex']\n",
-"    categorical_transformer = Pipeline(steps=[\n",
-"        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),\n",
-"        ('onehot', OneHotEncoder(handle_unknown='ignore'))])\n",
-"\n",
-"    preprocessor = ColumnTransformer(\n",
-"        transformers=[\n",
-"            ('num', numeric_transformer, numeric_features),\n",
-"            ('cat', categorical_transformer, categorical_features)],\n",
-"        remainder=\"drop\")\n",
+"    numeric_transformer = make_pipeline(\n",
+"        SimpleImputer(strategy='median'),\n",
+"        StandardScaler())\n",
+"\n",
+"    categorical_transformer = make_pipeline(\n",
+"        SimpleImputer(strategy='constant', fill_value='missing'),\n",
+"        OneHotEncoder(handle_unknown='ignore'))\n",
+"\n",
+"    preprocessor = ColumnTransformer(transformers=[\n",
+"        (\"num\", numeric_transformer, make_column_selector(dtype_exclude=\"category\")),\n",
+"        (\"cat\", categorical_transformer, make_column_selector(dtype_include=\"category\"))])\n",
 "    \n",
 "    preprocessor.fit(concat_data)\n",
 "\n",
@@ -268,9 +266,9 @@
 "\n",
 "        json_output = {\"instances\": instances}\n",
 "\n",
-"        return worker.Response(json.dumps(json_output), accept, mimetype=accept)\n",
+"        return worker.Response(json.dumps(json_output), mimetype=accept)\n",
 "    elif accept == 'text/csv':\n",
-"        return worker.Response(encoders.encode(prediction, accept), accept, mimetype=accept)\n",
+"        return worker.Response(encoders.encode(prediction, accept), mimetype=accept)\n",
 "    else:\n",
 "        raise RuntimeException(\"{} accept type is not supported by this script.\".format(accept))\n",
 "\n",
@@ -286,7 +284,7 @@
 "    rest of features either one hot encoded or standardized\n",
 "    \"\"\"\n",
 "    features = model.transform(input_data)\n",
-"    \n",
+"\n",
 "    if label_column in input_data:\n",
 "        # Return the label (as the first column) and the set of features.\n",
 "        return np.insert(features, 0, input_data[label_column], axis=1)\n",
@@ -313,6 +311,7 @@
 "\n",
 "* __entry_point__: The path to the Python script SageMaker runs for training and prediction.\n",
 "* __role__: Role ARN\n",
+"* __framework_version__: Scikit-learn version you want to use for executing your model training code.\n",
 "* __train_instance_type__ *(optional)*: The type of SageMaker instances for training. __Note__: Because Scikit-learn does not natively support GPU training, Sagemaker Scikit-learn does not currently support training on GPU instance types.\n",
 "* __sagemaker_session__ *(optional)*: The session used to train on Sagemaker.\n",
 "\n",
@@ -327,11 +326,13 @@
 "source": [
 "from sagemaker.sklearn.estimator import SKLearn\n",
 "\n",
+"FRAMEWORK_VERSION = \"0.23-1\"\n",
 "script_path = 'sklearn_abalone_featurizer.py'\n",
 "\n",
 "sklearn_preprocessor = SKLearn(\n",
 "    entry_point=script_path,\n",
 "    role=role,\n",
+"    framework_version=FRAMEWORK_VERSION,\n",
 "    train_instance_type=\"ml.c4.xlarge\",\n",
 "    sagemaker_session=sagemaker_session)\n"
 ]
@@ -362,7 +363,7 @@
 "# Define a SKLearn Transformer from the trained SKLearn Estimator\n",
 "transformer = sklearn_preprocessor.transformer(\n",
 "    instance_count=1, \n",
-"    instance_type='ml.m4.xlarge',\n",
+"    instance_type='ml.m5.xlarge',\n",
 "    assemble_with = 'Line',\n",
 "    accept = 'text/csv')"
 ]
@@ -374,8 +375,8 @@
 "outputs": [],
 "source": [
 "# Preprocess training input\n",
-"transformer.transform(train_input, content_type='text/csv')\n",
-"print('Waiting for transform job: ' + transformer.latest_transform_job.job_name)\n",
+"transformer.transform(train_input, content_type=\"text/csv\")\n",
+"print(\"Waiting for transform job: \" + transformer.latest_transform_job.job_name)\n",
 "transformer.wait()\n",
 "preprocessed_train = transformer.output_path"
 ]
@@ -524,13 +525,6 @@
 "sm_client = sagemaker_session.boto_session.client('sagemaker')\n",
 "sm_client.delete_endpoint(EndpointName=endpoint_name)"
 ]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": []
 }
 ],
 "metadata": {

Diff for: sagemaker-python-sdk/scikit_learn_inference_pipeline/sklearn_abalone_featurizer.py

+29 −31
@@ -9,13 +9,13 @@
 import argparse
 import csv
 import json
+import joblib
 import numpy as np
 import pandas as pd
 
-from sklearn.compose import ColumnTransformer
-from sklearn.externals import joblib
+from sklearn.compose import ColumnTransformer, make_column_selector
 from sklearn.impute import SimpleImputer
-from sklearn.pipeline import Pipeline
+from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import Binarizer, StandardScaler, OneHotEncoder
 
 from sagemaker_containers.beta.framework import (
@@ -35,16 +35,16 @@
 label_column = 'rings'
 
 feature_columns_dtype = {
-    'sex': str,
-    'length': np.float64,
-    'diameter': np.float64,
-    'height': np.float64,
-    'whole_weight': np.float64,
-    'shucked_weight': np.float64,
-    'viscera_weight': np.float64,
-    'shell_weight': np.float64}
+    'sex': "category",
+    'length': "float64",
+    'diameter': "float64",
+    'height': "float64",
+    'whole_weight': "float64",
+    'shucked_weight': "float64",
+    'viscera_weight': "float64",
+    'shell_weight': "float64"}
 
-label_column_dtype = {'rings': np.float64} # +1.5 gives the age in years
+label_column_dtype = {'rings': "float64"} # +1.5 gives the age in years
 
 def merge_two_dicts(x, y):
     z = x.copy() # start with x's keys and values
@@ -76,7 +76,10 @@ def merge_two_dicts(x, y):
         names=feature_columns_names + [label_column],
         dtype=merge_two_dicts(feature_columns_dtype, label_column_dtype)) for file in input_files ]
     concat_data = pd.concat(raw_data)
-    
+
+    # Labels should not be preprocessed. predict_fn will reinsert the labels after featurizing.
+    concat_data.drop(label_column, axis=1, inplace=True)
+
     # This section is adapted from the scikit-learn example of using preprocessing pipelines:
     #
     # https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
@@ -92,22 +95,17 @@ def merge_two_dicts(x, y):
     # - shell_weight: Weight after being dried
     # Categorical Features:
     # - sex: categories encoded as strings {'M', 'F', 'I'} where 'I' is Infant
-    numeric_features = list(feature_columns_names)
-    numeric_features.remove('sex')
-    numeric_transformer = Pipeline(steps=[
-        ('imputer', SimpleImputer(strategy='median')),
-        ('scaler', StandardScaler())])
-
-    categorical_features = ['sex']
-    categorical_transformer = Pipeline(steps=[
-        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
-        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
-
-    preprocessor = ColumnTransformer(
-        transformers=[
-            ('num', numeric_transformer, numeric_features),
-            ('cat', categorical_transformer, categorical_features)],
-        remainder="drop")
+    numeric_transformer = make_pipeline(
+        SimpleImputer(strategy='median'),
+        StandardScaler())
+
+    categorical_transformer = make_pipeline(
+        SimpleImputer(strategy='constant', fill_value='missing'),
+        OneHotEncoder(handle_unknown='ignore'))
+
+    preprocessor = ColumnTransformer(transformers=[
+        ("num", numeric_transformer, make_column_selector(dtype_exclude="category")),
+        ("cat", categorical_transformer, make_column_selector(dtype_include="category"))])
 
     preprocessor.fit(concat_data)
 
@@ -172,7 +170,7 @@ def predict_fn(input_data, model):
     rest of features either one hot encoded or standardized
     """
     features = model.transform(input_data)
-    
+
     if label_column in input_data:
         # Return the label (as the first column) and the set of features.
         return np.insert(features, 0, input_data[label_column], axis=1)
@@ -185,4 +183,4 @@ def model_fn(model_dir):
     """Deserialize fitted model
     """
     preprocessor = joblib.load(os.path.join(model_dir, "model.joblib"))
-    return preprocessor
\ No newline at end of file
+    return preprocessor
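The import swap at the top of the featurizer (`from sklearn.externals import joblib` → `import joblib`) is required on 0.23: the vendored copy of joblib was removed from scikit-learn, and the standalone package is a drop-in for `dump`/`load`. A small round-trip sketch mirroring the script's `model_fn`; the temp directory stands in for SageMaker's `model_dir`:

import os
import tempfile

import joblib  # standalone package; sklearn.externals.joblib is gone in 0.23
from sklearn.preprocessing import StandardScaler

# Persist and reload a fitted transformer the same way the featurizer
# saves its preprocessor as model.joblib.
model_dir = tempfile.mkdtemp()
scaler = StandardScaler().fit([[0.0], [1.0], [2.0]])
joblib.dump(scaler, os.path.join(model_dir, "model.joblib"))

preprocessor = joblib.load(os.path.join(model_dir, "model.joblib"))
print(preprocessor.mean_)  # [1.]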

Diff for: sagemaker-python-sdk/scikit_learn_iris/Scikit-learn Estimator Example With Batch Transform.ipynb

+11 −14
@@ -115,12 +115,14 @@
 "A typical training script loads data from the input channels, configures training with hyperparameters, trains a model, and saves a model to model_dir so that it can be hosted later. Hyperparameters are passed to your script as arguments and can be retrieved with an `argparse.ArgumentParser` instance. For example, the script that we will run in this notebook is the below:\n",
 "\n",
 "```python\n",
+"from __future__ import print_function\n",
+"\n",
 "import argparse\n",
-"import pandas as pd\n",
+"import joblib\n",
 "import os\n",
+"import pandas as pd\n",
 "\n",
 "from sklearn import tree\n",
-"from sklearn.externals import joblib\n",
 "\n",
 "\n",
 "if __name__ == '__main__':\n",
@@ -147,8 +149,8 @@
 "    train_data = pd.concat(raw_data)\n",
 "\n",
 "    # labels are in the first column\n",
-"    train_y = train_data.ix[:,0]\n",
-"    train_X = train_data.ix[:,1:]\n",
+"    train_y = train_data.iloc[:, 0]\n",
+"    train_X = train_data.iloc[:, 1:]\n",
 "\n",
 "    # Here we support a single hyperparameter, 'max_leaf_nodes'. Note that you can add as many\n",
 "    # as your training my require in the ArgumentParser above.\n",
@@ -206,10 +208,12 @@
 "source": [
 "from sagemaker.sklearn.estimator import SKLearn\n",
 "\n",
+"FRAMEWORK_VERSION = \"0.23-1\"\n",
 "script_path = 'scikit_learn_iris.py'\n",
 "\n",
 "sklearn = SKLearn(\n",
 "    entry_point=script_path,\n",
+"    framework_version=FRAMEWORK_VERSION,\n",
 "    train_instance_type=\"ml.c4.xlarge\",\n",
 "    role=role,\n",
 "    sagemaker_session=sagemaker_session,\n",
@@ -252,7 +256,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"predictor = sklearn.deploy(initial_instance_count=1, instance_type=\"ml.m4.xlarge\")"
+"predictor = sklearn.deploy(initial_instance_count=1, instance_type=\"ml.m5.xlarge\")"
 ]
 },
 {
@@ -334,7 +338,7 @@
 "outputs": [],
 "source": [
 "# Define a SKLearn Transformer from the trained SKLearn Estimator\n",
-"transformer = sklearn.transformer(instance_count=1, instance_type='ml.m4.xlarge')"
+"transformer = sklearn.transformer(instance_count=1, instance_type='ml.m5.xlarge')"
 ]
 },
 {
@@ -427,13 +431,6 @@
 "    | sed \"s/\\/dev\\/fd\\/63/batch_data\\/output\\/iris_sample_X_${i}.csv.out/\"\n",
 "done"
 ]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": []
 }
 ],
 "metadata": {
@@ -452,7 +449,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.6.5"
+"version": "3.6.10"
 }
 },
 "nbformat": 4,

Diff for: sagemaker-python-sdk/scikit_learn_iris/scikit_learn_iris.py

+3 −3
@@ -14,11 +14,11 @@
 from __future__ import print_function
 
 import argparse
+import joblib
 import os
 import pandas as pd
 
 from sklearn import tree
-from sklearn.externals import joblib
 
 
 if __name__ == '__main__':
@@ -45,8 +45,8 @@
     train_data = pd.concat(raw_data)
 
     # labels are in the first column
-    train_y = train_data.ix[:,0]
-    train_X = train_data.ix[:,1:]
+    train_y = train_data.iloc[:, 0]
+    train_X = train_data.iloc[:, 1:]
 
     # Here we support a single hyperparameter, 'max_leaf_nodes'. Note that you can add as many
     # as your training my require in the ArgumentParser above.
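Putting the updated pieces of scikit_learn_iris.py together, this is roughly what the script does after argument parsing; the toy frame and the simulated --max_leaf_nodes value are illustrative stand-ins for the train-channel CSVs and the hyperparameters SageMaker passes in:

import argparse

import pandas as pd
from sklearn import tree

# Parse the single supported hyperparameter, as the script does; the
# explicit argv list simulates SageMaker passing --max_leaf_nodes.
parser = argparse.ArgumentParser()
parser.add_argument("--max_leaf_nodes", type=int, default=None)
args = parser.parse_args(["--max_leaf_nodes", "30"])

# Toy label-first frame standing in for the concatenated training CSVs.
train_data = pd.DataFrame([[0, 5.1, 3.5], [0, 4.9, 3.0], [1, 6.2, 2.9]])
train_y = train_data.iloc[:, 0]
train_X = train_data.iloc[:, 1:]

clf = tree.DecisionTreeClassifier(max_leaf_nodes=args.max_leaf_nodes)
clf = clf.fit(train_X, train_y)
print(clf.predict(train_X.iloc[:1]))  # [0]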
