|
40 | 40 | },
41 | 41 | "outputs": [],
42 | 42 | "source": [
43 |    | - "# S3 prefix\n",
44 |    | - "bucket = '< ENTER BUCKET NAME HERE >'\n",
45 |    | - "prefix = 'Scikit-LinearLearner-pipeline-abalone-example'\n",
46 |    | - "\n",
47 | 43 | "import sagemaker\n",
48 | 44 | "from sagemaker import get_execution_role\n",
49 | 45 | "\n",
50 | 46 | "sagemaker_session = sagemaker.Session()\n",
51 | 47 | "\n",
52 | 48 | "# Get a SageMaker-compatible role used by this Notebook Instance.\n",
53 |    | - "role = get_execution_role()"
   | 49 | + "role = get_execution_role()\n",
   | 50 | + "\n",
   | 51 | + "# S3 prefix\n",
   | 52 | + "bucket = sagemaker_session.default_bucket()\n",
   | 53 | + "prefix = 'Scikit-LinearLearner-pipeline-abalone-example'"
54 | 54 | ]
55 | 55 | },
56 | 56 | {
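Net effect of the hunk above: the `< ENTER BUCKET NAME HERE >` placeholder is gone and the bucket comes from the session, so the setup cell runs without manual edits. The resulting cell, reassembled from the new lines (the comment wording is the notebook's own; the note on `default_bucket()` is ours):

    import sagemaker
    from sagemaker import get_execution_role

    sagemaker_session = sagemaker.Session()

    # Get a SageMaker-compatible role used by this Notebook Instance.
    role = get_execution_role()

    # S3 prefix. default_bucket() resolves to sagemaker-<region>-<account-id>
    # and creates that bucket on first use, so no hand-entered name is needed.
    bucket = sagemaker_session.default_bucket()
    prefix = 'Scikit-LinearLearner-pipeline-abalone-example'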
|
|
123 | 123 | "import argparse\n",
|
124 | 124 | "import csv\n",
|
125 | 125 | "import json\n",
|
| 126 | + "import joblib\n", |
126 | 127 | "import numpy as np\n",
|
127 | 128 | "import pandas as pd\n",
|
128 | 129 | "\n",
|
129 |
| - "from sklearn.compose import ColumnTransformer\n", |
130 |
| - "from sklearn.externals import joblib\n", |
| 130 | + "from sklearn.compose import ColumnTransformer, make_column_selector\n", |
131 | 131 | "from sklearn.impute import SimpleImputer\n",
|
132 |
| - "from sklearn.pipeline import Pipeline\n", |
| 132 | + "from sklearn.pipeline import make_pipeline\n", |
133 | 133 | "from sklearn.preprocessing import Binarizer, StandardScaler, OneHotEncoder\n",
|
134 | 134 | "\n",
|
135 | 135 | "from sagemaker_containers.beta.framework import (\n",
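The import swap tracks scikit-learn itself: `sklearn.externals.joblib` was deprecated in 0.21 and removed in 0.23, so the script now imports the standalone `joblib` package. A minimal sketch of the persistence pattern this import supports (the estimator and file name here are illustrative, not the featurizer's exact save path):

    import joblib
    from sklearn.preprocessing import StandardScaler

    # Fit any estimator, then persist and reload it with standalone joblib.
    scaler = StandardScaler().fit([[0.0], [1.0], [2.0]])
    joblib.dump(scaler, "model.joblib")
    restored = joblib.load("model.joblib")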
|
|
149 | 149 | "label_column = 'rings'\n",
150 | 150 | "\n",
151 | 151 | "feature_columns_dtype = {\n",
152 |     | - " 'sex': str,\n",
153 |     | - " 'length': np.float64,\n",
154 |     | - " 'diameter': np.float64,\n",
155 |     | - " 'height': np.float64,\n",
156 |     | - " 'whole_weight': np.float64,\n",
157 |     | - " 'shucked_weight': np.float64,\n",
158 |     | - " 'viscera_weight': np.float64,\n",
159 |     | - " 'shell_weight': np.float64}\n",
    | 152 | + " 'sex': \"category\",\n",
    | 153 | + " 'length': \"float64\",\n",
    | 154 | + " 'diameter': \"float64\",\n",
    | 155 | + " 'height': \"float64\",\n",
    | 156 | + " 'whole_weight': \"float64\",\n",
    | 157 | + " 'shucked_weight': \"float64\",\n",
    | 158 | + " 'viscera_weight': \"float64\",\n",
    | 159 | + " 'shell_weight': \"float64\"}\n",
160 | 160 | "\n",
161 |     | - "label_column_dtype = {'rings': np.float64} # +1.5 gives the age in years\n",
    | 161 | + "label_column_dtype = {'rings': \"float64\"} # +1.5 gives the age in years\n",
162 | 162 | "\n",
163 | 163 | "def merge_two_dicts(x, y):\n",
164 | 164 | " z = x.copy() # start with x's keys and values\n",
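The dtype maps now use pandas dtype strings instead of Python/NumPy types, and `'sex'` is read as `"category"`. That tagging is what lets the `ColumnTransformer` further down select columns by dtype rather than by hard-coded name lists. A small illustration with hypothetical values:

    import pandas as pd

    # "float64" is equivalent to np.float64; "category" tags 'sex' with a
    # dtype that make_column_selector can later match on.
    df = pd.DataFrame({"sex": ["M", "F", "I"], "length": [0.455, 0.35, 0.53]})
    df = df.astype({"sex": "category", "length": "float64"})
    df.dtypes  # sex: category, length: float64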
|
|
190 | 190 | " names=feature_columns_names + [label_column],\n",
191 | 191 | " dtype=merge_two_dicts(feature_columns_dtype, label_column_dtype)) for file in input_files ]\n",
192 | 192 | " concat_data = pd.concat(raw_data)\n",
193 |     | - " \n",
    | 193 | + "\n",
    | 194 | + " # Labels should not be preprocessed. predict_fn will reinsert the labels after featurizing.\n",
    | 195 | + " concat_data.drop(label_column, axis=1, inplace=True)\n",
    | 196 | + "\n",
194 | 197 | " # This section is adapted from the scikit-learn example of using preprocessing pipelines:\n",
195 | 198 | " #\n",
196 | 199 | " # https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html\n",
|
|
206 | 209 | " # - shell_weight: Weight after being dried\n",
207 | 210 | " # Categorical Features:\n",
208 | 211 | " # - sex: categories encoded as strings {'M', 'F', 'I'} where 'I' is Infant\n",
209 |     | - " numeric_features = list(feature_columns_names)\n",
210 |     | - " numeric_features.remove('sex')\n",
211 |     | - " numeric_transformer = Pipeline(steps=[\n",
212 |     | - " ('imputer', SimpleImputer(strategy='median')),\n",
213 |     | - " ('scaler', StandardScaler())])\n",
214 |     | - "\n",
215 |     | - " categorical_features = ['sex']\n",
216 |     | - " categorical_transformer = Pipeline(steps=[\n",
217 |     | - " ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),\n",
218 |     | - " ('onehot', OneHotEncoder(handle_unknown='ignore'))])\n",
219 |     | - "\n",
220 |     | - " preprocessor = ColumnTransformer(\n",
221 |     | - " transformers=[\n",
222 |     | - " ('num', numeric_transformer, numeric_features),\n",
223 |     | - " ('cat', categorical_transformer, categorical_features)],\n",
224 |     | - " remainder=\"drop\")\n",
    | 212 | + " numeric_transformer = make_pipeline(\n",
    | 213 | + " SimpleImputer(strategy='median'),\n",
    | 214 | + " StandardScaler())\n",
    | 215 | + "\n",
    | 216 | + " categorical_transformer = make_pipeline(\n",
    | 217 | + " SimpleImputer(strategy='constant', fill_value='missing'),\n",
    | 218 | + " OneHotEncoder(handle_unknown='ignore'))\n",
    | 219 | + "\n",
    | 220 | + " preprocessor = ColumnTransformer(transformers=[\n",
    | 221 | + " (\"num\", numeric_transformer, make_column_selector(dtype_exclude=\"category\")),\n",
    | 222 | + " (\"cat\", categorical_transformer, make_column_selector(dtype_include=\"category\"))])\n",
225 | 223 | " \n",
226 | 224 | " preprocessor.fit(concat_data)\n",
227 | 225 | "\n",
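The rewritten preprocessing block replaces the hand-maintained `numeric_features` / `categorical_features` name lists with `make_column_selector`, which resolves columns at fit time from their dtypes. A minimal sketch of how the selectors behave, on a toy frame rather than the abalone data:

    import pandas as pd
    from sklearn.compose import make_column_selector

    df = pd.DataFrame({"sex": pd.Series(["M", "I"], dtype="category"),
                       "length": [0.455, 0.35]})

    # A selector is a callable: given a DataFrame, it returns the matching
    # column names, so the pipeline needs no hard-coded feature lists.
    make_column_selector(dtype_include="category")(df)  # -> ['sex']
    make_column_selector(dtype_exclude="category")(df)  # -> ['length']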
|
|
268 | 266 | "\n",
269 | 267 | " json_output = {\"instances\": instances}\n",
270 | 268 | "\n",
271 |     | - " return worker.Response(json.dumps(json_output), accept, mimetype=accept)\n",
    | 269 | + " return worker.Response(json.dumps(json_output), mimetype=accept)\n",
272 | 270 | " elif accept == 'text/csv':\n",
273 |     | - " return worker.Response(encoders.encode(prediction, accept), accept, mimetype=accept)\n",
    | 271 | + " return worker.Response(encoders.encode(prediction, accept), mimetype=accept)\n",
274 | 272 | " else:\n",
275 | 273 | " raise RuntimeException(\"{} accept type is not supported by this script.\".format(accept))\n",
276 | 274 | "\n",
|
|
286 | 284 | " rest of features either one hot encoded or standardized\n",
287 | 285 | " \"\"\"\n",
288 | 286 | " features = model.transform(input_data)\n",
289 |     | - " \n",
    | 287 | + "\n",
290 | 288 | " if label_column in input_data:\n",
291 | 289 | " # Return the label (as the first column) and the set of features.\n",
292 | 290 | " return np.insert(features, 0, input_data[label_column], axis=1)\n",
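Together with the `concat_data.drop(label_column, ...)` added in `fit` above, this completes the label round trip: the featurizer is fit on features only, and when a labeled frame passes through at transform time the label is reinserted as column 0, giving the downstream LinearLearner target-first CSV. A toy illustration of the `np.insert` call, with made-up numbers:

    import numpy as np

    features = np.array([[0.1, 0.2], [0.3, 0.4]])  # featurized rows
    rings = np.array([15.0, 7.0])                  # label values
    np.insert(features, 0, rings, axis=1)
    # array([[15. ,  0.1,  0.2],
    #        [ 7. ,  0.3,  0.4]])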
|
|
313 | 311 | "\n",
314 | 312 | "* __entry_point__: The path to the Python script SageMaker runs for training and prediction.\n",
315 | 313 | "* __role__: Role ARN\n",
    | 314 | + "* __framework_version__: Scikit-learn version you want to use for executing your model training code.\n",
316 | 315 | "* __train_instance_type__ *(optional)*: The type of SageMaker instances for training. __Note__: Because Scikit-learn does not natively support GPU training, Sagemaker Scikit-learn does not currently support training on GPU instance types.\n",
317 | 316 | "* __sagemaker_session__ *(optional)*: The session used to train on Sagemaker.\n",
318 | 317 | "\n",
|
|
327 | 326 | "source": [
328 | 327 | "from sagemaker.sklearn.estimator import SKLearn\n",
329 | 328 | "\n",
    | 329 | + "FRAMEWORK_VERSION = \"0.23-1\"\n",
330 | 330 | "script_path = 'sklearn_abalone_featurizer.py'\n",
331 | 331 | "\n",
332 | 332 | "sklearn_preprocessor = SKLearn(\n",
333 | 333 | " entry_point=script_path,\n",
334 | 334 | " role=role,\n",
    | 335 | + " framework_version=FRAMEWORK_VERSION,\n",
335 | 336 | " train_instance_type=\"ml.c4.xlarge\",\n",
336 | 337 | " sagemaker_session=sagemaker_session)\n"
337 | 338 | ]
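Pinning `framework_version="0.23-1"` selects the prebuilt scikit-learn 0.23 container and matches the `joblib` / `make_column_selector` changes in the featurizer script; the SageMaker Python SDK now asks for this version explicitly rather than falling back to a default. A sketch of launching the featurizer job as a continuation of the cell above (assumes `train_input` is the S3 URI of the uploaded abalone CSV, set earlier in the notebook):

    # Fit the preprocessor as a SageMaker training job; 'train' is the
    # channel name the entry-point script reads its input files from.
    sklearn_preprocessor.fit({"train": train_input})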
|
|
362 | 363 | "# Define a SKLearn Transformer from the trained SKLearn Estimator\n",
363 | 364 | "transformer = sklearn_preprocessor.transformer(\n",
364 | 365 | " instance_count=1, \n",
365 |     | - " instance_type='ml.m4.xlarge',\n",
    | 366 | + " instance_type='ml.m5.xlarge',\n",
366 | 367 | " assemble_with = 'Line',\n",
367 | 368 | " accept = 'text/csv')"
|
368 | 369 | ]
|
|
374 | 375 | "outputs": [],
375 | 376 | "source": [
376 | 377 | "# Preprocess training input\n",
377 |     | - "transformer.transform(train_input, content_type='text/csv')\n",
378 |     | - "print('Waiting for transform job: ' + transformer.latest_transform_job.job_name)\n",
    | 378 | + "transformer.transform(train_input, content_type=\"text/csv\")\n",
    | 379 | + "print(\"Waiting for transform job: \" + transformer.latest_transform_job.job_name)\n",
379 | 380 | "transformer.wait()\n",
380 | 381 | "preprocessed_train = transformer.output_path"
|
381 | 382 | ]
|
|
524 | 525 | "sm_client = sagemaker_session.boto_session.client('sagemaker')\n",
525 | 526 | "sm_client.delete_endpoint(EndpointName=endpoint_name)"
526 | 527 | ]
527 |     | - },
528 |     | - {
529 |     | - "cell_type": "code",
530 |     | - "execution_count": null,
531 |     | - "metadata": {},
532 |     | - "outputs": [],
533 |     | - "source": []
534 | 528 | }
535 | 529 | ],
536 | 530 | "metadata": {
|
|