Skip to content

Commit

Permalink
PR changes
Browse files Browse the repository at this point in the history
  • Loading branch information
dmitrypolo committed Aug 7, 2018
1 parent c63493c commit a0b1896
Show file tree
Hide file tree
Showing 10 changed files with 87 additions and 123 deletions.
2 changes: 1 addition & 1 deletion cookiecutter.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@
"s3_bucket": "[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')",
"aws_profile": "default",
"python_interpreter": ["python3", "python"],
"include_starter_proj": ["N", "Y"]
"include_starter_proj": ["n", "y"]
}
2 changes: 1 addition & 1 deletion hooks/post_gen_project.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import shutil

# Starter-project directories handled by this hook.
# NOTE(review): presumably these are removed when the starter project is
# declined - the code that consumes DIRS is outside this view; confirm.
DIRS = ['src/models']
# Working directory at the moment the hook runs.
CWD = os.getcwd()

if '{{ cookiecutter.include_starter_proj }}' == 'N':
Expand Down
5 changes: 5 additions & 0 deletions {{ cookiecutter.repo_name }}/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,11 @@ requirements: test_environment

## Make Dataset
data: requirements
wget --progress=bar:force -O data/raw/transfusion_data_raw.csv https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data
$(PYTHON_INTERPRETER) src/data/make_dataset.py

{# Compare case-insensitively: cookiecutter.json offers the choices "n"/"y". #}
{% if cookiecutter.include_starter_proj | lower == 'y' %}

## Train Model
train: data
$(PYTHON_INTERPRETER) src/models/train_model.py
Expand All @@ -37,6 +40,8 @@ train: data
model_pipeline: train
$(PYTHON_INTERPRETER) src/models/predict_model.py

{% endif %}

## Delete all compiled Python files
clean:
find . -type f -name "*.py[co]" -delete
Expand Down
1 change: 0 additions & 1 deletion {{ cookiecutter.repo_name }}/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
name='src',
packages=find_packages(),
version='0.1.0',
install_requires=['{% if cookiecutter.include_starter_proj == "Y" %}scikit-learn[alldeps]{% endif %}'],
description='{{ cookiecutter.description }}',
author='{{ cookiecutter.author_name }}',
license='{% if cookiecutter.open_source_license == "MIT" %}MIT{% elif cookiecutter.open_source_license == "BSD-3-Clause" %}BSD-3{% endif %}',
Expand Down
70 changes: 21 additions & 49 deletions {{ cookiecutter.repo_name }}/src/data/make_dataset.py
Original file line number Diff line number Diff line change
@@ -1,68 +1,40 @@
import os
import logging
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from dotenv import find_dotenv, load_dotenv

# URL of the Titanic passenger dataset (CSV).
DATA_LINK = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv'
# Honorifics that extract_title() looks for inside a name. Order matters: the
# first entry found wins, which is why 'Mrs' is listed before 'Mr'.
TITLES = ['Mlle', 'Mrs', 'Mr', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms', 'Major', 'Col', 'Capt', 'Countess']
# Directory two levels above the one containing this file; with this module
# located in src/data/ that is the project root.
ROOT = Path(__file__).resolve().parents[2]


def extract_title(name):
    """
    return the first entry of TITLES that occurs in name

    Matching is a plain, case-sensitive substring test carried out in
    TITLES order. A name containing none of the titles is labelled 'Mr'.
    """
    return next((candidate for candidate in TITLES if candidate in name), 'Mr')


def massage_data(raw_data):
    """
    preprocess the raw blood-transfusion data for predictions

    The frame is modified in place and also returned:

    * the target column "whether he/she donated blood in March 2007" is
      renamed to "label" (the same rename call converts the row labels
      to strings);
    * "Time (months)" and "Recency (months)" each gain a whole-years
      column (time_years, recency_years) and a whole-quarters column
      (time_quarters, recency_quarters).

    :param raw_data: DataFrame holding at least the "Time (months)" and
        "Recency (months)" columns
    :return: the same DataFrame object, with the changes described above
    """
    raw_data.rename(index=str, columns={"whether he/she donated blood in March 2007": "label"}, inplace=True)

    month_columns = (('time', 'Time (months)'), ('recency', 'Recency (months)'))
    periods = (('years', 12), ('quarters', 3))

    # Years are generated before quarters so the derived columns are appended
    # in the order time_years, recency_years, time_quarters, recency_quarters.
    for suffix, months_per_period in periods:
        for prefix, source in month_columns:
            # Truncate to the number of complete periods.
            raw_data[prefix + '_' + suffix] = (raw_data[source] / months_per_period).astype('int')

    return raw_data


def dump_data(data, out_loc):
    """
    write data as a csv file, without its index column, to out_loc,
    a location that is joined onto the project ROOT
    """
    data.to_csv(os.path.join(ROOT, out_loc), index=False)


def main():
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).

        The raw csv is read from disk, so it has to exist before this
        runs; the Makefile `data` target downloads it first.
    """
    raw_path = ROOT / 'data/raw/transfusion_data_raw.csv'
    processed_path = ROOT / 'data/processed/transfusion_data.csv'

    processed_data = massage_data(pd.read_csv(raw_path))
    processed_data.to_csv(processed_path, index=False)


if __name__ == '__main__':
    # Set up logging at INFO level before any work is done.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    )

    # Load settings from the .env file located by find_dotenv().
    load_dotenv(find_dotenv())

    main()
32 changes: 21 additions & 11 deletions {{ cookiecutter.repo_name }}/src/models/predict_model.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,38 @@
import os
import pickle
import logging
import pandas as pd
from pathlib import Path
from sklearn.metrics import roc_auc_score
from train_model import ROOT

ROOT = Path(__file__).resolve().parents[2]


def retrieve_model():
    """retrieve the pickled model object

    Returns whatever object is stored in models/transfusion.model under
    ROOT.
    """
    # NOTE: unpickling can execute arbitrary code; only load model files
    # written by this project's own training step.
    with open(ROOT / 'models/transfusion.model', 'rb') as fin:
        return pickle.load(fin)


def main():
    """ retrieve the model and predict labels for the stored test set

        Returns a tuple (y_pred, auc): the labels predicted for
        transfusion_x_test.csv and the area under the ROC curve measured
        against transfusion_y_test.csv.
    """
    deserialized_model = retrieve_model()
    processed_dir = ROOT / 'data/processed'

    X_test = pd.read_csv(processed_dir / 'transfusion_x_test.csv')
    # The label file is read as header-less: its first row is data.
    y_test = pd.read_csv(processed_dir / 'transfusion_y_test.csv', header=None)

    y_pred = deserialized_model.predict(X_test)

    # AUC is scored on column 1 of predict_proba, not on the hard labels.
    scores = deserialized_model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test.astype(int), scores)
    return y_pred, auc


if __name__ == '__main__':
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)
    logger = logging.getLogger(__file__)

    # Run the prediction step exactly once and report its results through
    # the logger created above.
    preds, auc = main()
    logger.info('The predictions are %s', preds)
    logger.info('The AUC is %s', auc)
34 changes: 20 additions & 14 deletions {{ cookiecutter.repo_name }}/src/models/train_model.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import os
import pickle
import pandas as pd
from pathlib import Path
Expand All @@ -14,38 +13,45 @@


def fetch_processed(data_path):
    """
    fetch the data that was processed in make data and split it

    :param data_path: location of the processed csv, relative to ROOT;
        the file must contain a `label` column
    :return: X_train, X_test, y_train, y_test, with 20% of the rows held
        out for testing (fixed random_state, so the split is repeatable)
    """
    data = pd.read_csv(ROOT / data_path)

    # `label` is the target; every other column is used as a feature.
    data_y = data['label']
    data_x = data.drop(['label'], axis=1)

    # Create training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        data_x, data_y, test_size=0.2, random_state=0)
    return X_train, X_test, y_train, y_test


def fit_model(X_train, y_train):
    """
    train a random forest of 100 trees on the given features and
    labels and return the fitted classifier
    """
    forest = RandomForestClassifier(n_estimators=100)
    # fit() hands back the estimator it was called on.
    return forest.fit(X_train, y_train)


def main():
    """ Trains the model on the processed data and writes the model and
        the held-out test split back to disk
    """
    x_train, x_test, y_train, y_test = fetch_processed('data/processed/transfusion_data.csv')

    # Train the model
    model = fit_model(x_train, y_train)

    # Store model and test set for prediction
    with open(ROOT / 'models/transfusion.model', 'wb') as fout:
        pickle.dump(model, fout, PROTOCOL)
    x_test.to_csv(ROOT / 'data/processed/transfusion_x_test.csv',
                  index=False)
    # The prediction step reads this file with header=None, so no header
    # row may be written; the default for a Series differs between pandas
    # versions, hence the explicit header=False.
    y_test.to_csv(ROOT / 'data/processed/transfusion_y_test.csv',
                  index=False, header=False)


if __name__ == '__main__':
Expand Down
11 changes: 0 additions & 11 deletions {{ cookiecutter.repo_name }}/tests/conftest.py

This file was deleted.

41 changes: 12 additions & 29 deletions {{ cookiecutter.repo_name }}/tests/test_make_data.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,17 @@
import os
import pytest
import random
import pandas as pd
from pathlib import Path
from src.data.make_dataset import extract_title, dump_data
from src.data.make_dataset import massage_data

ROOT = Path(__file__).resolve().parents[2]

# Four-row stand-in for the raw transfusion data: the target column plus the
# two month-valued columns that massage_data() derives features from.
mock_data = {
'whether he/she donated blood in March 2007': [1, 0, 0, 1],
'Time (months)': [36, 10, 12, 16],
'Recency (months)': [10, 20, 15, 22]
}

def test_extract_title():
    """extract_title returns the expected title for every sample name."""
    expected_maps = {
        'Mr Bob': 'Mr',
        'Mrs Daisy': 'Mrs',
        # No title in the name: the fallback value is expected.
        'Sam': 'Mr',
    }
    # Check every case on every run instead of one randomly chosen name, so
    # a failure is reproducible.
    for name, expected_title in expected_maps.items():
        assert extract_title(name) == expected_title


@pytest.mark.usefixtures('tmp_dump_dir')
def test_dump_data(tmp_dump_dir, monkeypatch):
    """A frame written by dump_data reads back unchanged."""
    # Make every os.path.join call return the fixture's path, so dump_data
    # writes there regardless of the location it is given.
    monkeypatch.setattr(os.path, 'join', lambda *paths: tmp_dump_dir)

    frame = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
    dump_data(frame, 'foo')
    assert frame.equals(pd.read_csv(tmp_dump_dir))



def test_massage_data():
    """massage_data keeps the source columns and derives the quarter feature."""
    raw = pd.DataFrame(mock_data)
    data = massage_data(raw)
    # Cells are addressed by column name so the checks do not depend on the
    # order in which columns happen to be created.
    assert data['Recency (months)'].iloc[0] == 10
    # 22 months of recency -> 7 complete quarters.
    assert data['recency_quarters'].iloc[3] == 7

12 changes: 6 additions & 6 deletions {{ cookiecutter.repo_name }}/tests/test_train_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@


# Four-row stand-in for the processed data: a `label` target column plus
# arbitrarily named feature columns.
mock_data = {
    'label': [1, 0, 0, 1],
    'fizz': ['John', 'Bob', 'Sam', 'Kevin'],
    'buzz': ['foo', 'bar', 'buzz', 'fizz'],
    'foo': ['y', 'n', 'm', 'y'],
    'bar': ['a', 'b', 'c', 'd'],
    'fish': ['nyc', 'la', 'boston', 'amherst'],
}


Expand Down

0 comments on commit a0b1896

Please sign in to comment.