PR changes

drivendataorg · Aug 7, 2018 · a0b1896 · a0b1896
1 parent c63493c
commit a0b1896
Show file tree

Hide file tree

Showing 10 changed files with 87 additions and 123 deletions.
diff --git a/cookiecutter.json b/cookiecutter.json
@@ -7,5 +7,5 @@
     "s3_bucket": "[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')",
     "aws_profile": "default",
     "python_interpreter": ["python3", "python"],
-    "include_starter_proj": ["N", "Y"]
+    "include_starter_proj": ["n", "y"]
 }
diff --git a/hooks/post_gen_project.py b/hooks/post_gen_project.py
@@ -1,7 +1,7 @@
 import os
 import shutil
 
-DIRS = ['src/data', 'src/models']
+DIRS = ['src/models']
 CWD = os.getcwd()
 
 if '{{ cookiecutter.include_starter_proj }}' == 'N':

diff --git a/{{ cookiecutter.repo_name }}/Makefile b/{{ cookiecutter.repo_name }}/Makefile
@@ -27,8 +27,11 @@ requirements: test_environment
 
 ## Make Dataset
 data: requirements
+	wget --progress=bar:force -O data/raw/transfusion_data_raw.csv https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data
 	$(PYTHON_INTERPRETER) src/data/make_dataset.py
 
+{% if cookiecutter.include_starter_proj == 'y' %}
+
 ## Train Model
 train: data
 	$(PYTHON_INTERPRETER) src/models/train_model.py
@@ -37,6 +40,8 @@ train: data
 model_pipeline: train
 	$(PYTHON_INTERPRETER) src/models/predict_model.py
 
+{% endif %}
+
 ## Delete all compiled Python files
 clean:
 	find . -type f -name "*.py[co]" -delete

diff --git a/{{ cookiecutter.repo_name }}/setup.py b/{{ cookiecutter.repo_name }}/setup.py
@@ -4,7 +4,6 @@
     name='src',
     packages=find_packages(),
     version='0.1.0',
-    install_requires=['{% if cookiecutter.include_starter_proj == "Y" %}scikit-learn[alldeps]{% endif %}'],
     description='{{ cookiecutter.description }}',
     author='{{ cookiecutter.author_name }}',
     license='{% if cookiecutter.open_source_license == "MIT" %}MIT{% elif cookiecutter.open_source_license == "BSD-3-Clause" %}BSD-3{% endif %}',

diff --git a/{{ cookiecutter.repo_name }}/src/data/make_dataset.py b/{{ cookiecutter.repo_name }}/src/data/make_dataset.py
@@ -1,68 +1,40 @@
-import os
+import logging
 import pandas as pd
 from pathlib import Path
-from sklearn.preprocessing import StandardScaler
+from dotenv import find_dotenv, load_dotenv
 
-DATA_LINK = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv'
-TITLES = ['Mlle', 'Mrs', 'Mr', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms', 'Major', 'Col', 'Capt', 'Countess']
 ROOT = Path(__file__).resolve().parents[2]
 
 
-def extract_title(name):
-    title = 'missing'
-    for item in TITLES:
-        if item in name:
-            title = item
-            break
-    if title == 'missing':
-        title = 'Mr'
-    return title
-
-
 def massage_data(raw_data):
+    """ Preprocess the data for predictions
     """
-    preprocess the data for predictions
-    """
-    # Feature engineering ---
-    raw_data["title"] = raw_data.apply(lambda row: extract_title(row["name"]), axis=1)  
-
-    # Age: replace NaN with median
-    raw_data["age"].fillna(raw_data.age.median(), inplace=True)
-
-    # Embarked: replace NaN with the mode value
-    raw_data["embarked"].fillna(raw_data.embarked.mode()[0], inplace=True)
-
-    # Fare: replace NaN with median
-    raw_data["fare"].fillna(raw_data.fare.median(), inplace=True)
-
-    # Encode Categorical features ---
-    raw_data["cabin"] = raw_data.apply(lambda obs: "No" if pd.isnull(obs['cabin']) else "Yes", axis=1)  # binarize “cabin” feature
-    raw_data = pd.get_dummies(raw_data, columns=['sex', 'title', 'cabin', 'embarked'])
+    raw_data.rename(index=str, columns={"whether he/she donated blood in March 2007": "label"}, inplace=True)
+
+    # generate features for year for time columns
+    for x, y in zip(['time_years', 'recency_years'], ['Time (months)', 'Recency (months)']):
+        raw_data[x] = (raw_data[y] / 12).astype('int')
+
+    # generate features for quarter for time columns (3 month periods)
+    for x, y in zip(['time_quarters', 'recency_quarters'], ['Time (months)', 'Recency (months)']):
+        raw_data[x] = (raw_data[y] / 3).astype('int')
 
-    # Scaling numerical features ---
-    scale = StandardScaler().fit(raw_data[['age', 'fare']])
-    raw_data[['age', 'fare']] = scale.transform(raw_data[['age', 'fare']])
     return raw_data
 
 
-def dump_data(data, out_loc):
-    """
-    given a path to a datafile, either a local file path
-    or a url, fetch the data and dump it to a csv
-    """
-    out_dir = os.path.join(ROOT, out_loc)
-    data.to_csv(out_dir, index=False)
-
-
 def main():
-    """ Runs data processing scripts to turn raw data from (../raw) into
+    """ Loads the previously downloaded raw data and runs processing scripts to turn raw data from (../raw) into
         cleaned data ready to be analyzed (saved in ../processed).
     """
-    raw_data = pd.read_csv(DATA_LINK)
-    dump_data(raw_data, 'data/raw/titanic.csv')
-    processed_data = massage_data(raw_data)
-    dump_data(processed_data, 'data/processed/titanic.csv')
+    df = pd.read_csv(ROOT / 'data/raw/transfusion_data_raw.csv')
+    processed_data = massage_data(df)
+    processed_data.to_csv(ROOT / 'data/processed/transfusion_data.csv', index=False)
 
 
 if __name__ == '__main__':
+    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    logging.basicConfig(level=logging.INFO, format=log_fmt)
+
+    load_dotenv(find_dotenv())
+
     main()
diff --git a/{{ cookiecutter.repo_name }}/src/models/predict_model.py b/{{ cookiecutter.repo_name }}/src/models/predict_model.py
@@ -1,28 +1,38 @@
-import os
 import pickle
+import logging
 import pandas as pd
+from pathlib import Path
 from sklearn.metrics import roc_auc_score
-from train_model import ROOT
+
+ROOT = Path(__file__).resolve().parents[2]
 
 
 def retrieve_model():
-    pickled_model = os.path.join(ROOT, 'models/titanic.model')
+    """retrieve the pickled model object
+    """
+    pickled_model = ROOT / 'models/transfusion.model'
     with open(pickled_model, 'rb') as fin:
         return(pickle.load(fin))
 
 
 def main():
+    """ Retrieve the model and predict labels. Return the predictions and the AUC score
+    """
     deserialized_model = retrieve_model()
-    X_test = pd.read_csv(os.path.join(ROOT, 
-        'data/processed/titanic_x_test.csv'))
+    X_test = pd.read_csv(ROOT / 'data/processed/transfusion_x_test.csv')
     y_pred = deserialized_model.predict(X_test)
 
-    y_test = pd.read_csv(os.path.join(ROOT,
-        'data/processed/titanic_y_test.csv'), header=None)
-    print(f'The model returned these predictions:\n{y_pred}')
-
+    y_test = pd.read_csv(ROOT / 'data/processed/transfusion_y_test.csv',
+        header=None)
     auc = roc_auc_score(y_test.astype(int), deserialized_model.predict_proba(X_test)[:, 1])
-    print('AUC (area under ROC curve): ' + str(auc))
+    return y_pred, auc
+
 
 if __name__ == '__main__':
-    main()
+    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    logging.basicConfig(level=logging.INFO, format=log_fmt)
+    logger = logging.getLogger(__file__)
+
+    preds, auc = main()
+    logger.info('The predictions are {}'.format(preds))
+    logger.info('The AUC is {}'.format(auc))
diff --git a/{{ cookiecutter.repo_name }}/src/models/train_model.py b/{{ cookiecutter.repo_name }}/src/models/train_model.py
@@ -1,4 +1,3 @@
-import os
 import pickle
 import pandas as pd
 from pathlib import Path
@@ -14,38 +13,45 @@
 
 
 def fetch_processed(data_path):
-    data = pd.read_csv(os.path.join(ROOT, data_path))
-    data_y = data.survived
-    data_x = data.drop(['survived', 'name', 'ticket', 'boat', 
-        'body', 'home.dest'], axis=1)
+    """
+    Fetch the data that was processed by make_dataset.py and split it into training and test sets
+    """
+    data = pd.read_csv(ROOT / data_path)
+    data_y = data.label
+    data_x = data.drop(['label'], axis=1)
+
     # Create training and test sets
     X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, 
         test_size=0.2, random_state=0)
     return X_train, X_test, y_train, y_test
 
 
 def fit_model(X_train, y_train):
+    """
+    fit a model to the training data
+    """
     model = RandomForestClassifier(n_estimators=100)
+
     # Fit to the training data
     model.fit(X_train, y_train)
     return model
 
 
 def main():
-    x_train, x_test, y_train, y_test = fetch_processed('data/processed/titanic.csv')
+    """ Trains the model on the processed data and writes the model and the held-out test set to file
+    """
+    x_train, x_test, y_train, y_test = fetch_processed('data/processed/transfusion_data.csv')
+
     # Train the model 
     model = fit_model(x_train, y_train)
-
-    # Paths for storage
-    model_out_dir = os.path.join(ROOT, 'models/titanic.model')
-    x_test_path = os.path.join(ROOT, 'data/processed/titanic_x_test.csv')
-    y_test_path = os.path.join(ROOT, 'data/processed/titanic_y_test.csv')
 
     # Store model and test set for prediction
-    with open(model_out_dir, 'wb') as fout:
+    with open(ROOT / 'models/transfusion.model', 'wb') as fout:
         pickle.dump(model, fout, PROTOCOL)
-    x_test.to_csv(x_test_path, index=False)
-    y_test.to_csv(y_test_path, index=False)
+    x_test.to_csv(ROOT / 'data/processed/transfusion_x_test.csv',
+        index=False)
+    y_test.to_csv(ROOT / 'data/processed/transfusion_y_test.csv',
+        index=False)
 
 
 if __name__ == '__main__':

diff --git a/{{ cookiecutter.repo_name }}/tests/conftest.py b/{{ cookiecutter.repo_name }}/tests/conftest.py
diff --git a/{{ cookiecutter.repo_name }}/tests/test_make_data.py b/{{ cookiecutter.repo_name }}/tests/test_make_data.py
@@ -1,34 +1,17 @@
-import os
-import pytest
-import random
 import pandas as pd
-from pathlib import Path
-from src.data.make_dataset import extract_title, dump_data
+from src.data.make_dataset import massage_data
 
-ROOT = Path(__file__).resolve().parents[2]
 
+mock_data = {
+   'whether he/she donated blood in March 2007': [1, 0, 0, 1],
+   'Time (months)': [36, 10, 12, 16],
+   'Recency (months)': [10, 20, 15, 22] 
+}
 
-def test_extract_title():
-    names = ['Mr Bob', 'Mrs Daisy', 'Sam']
-    expected_maps = {
-        'Mr Bob': 'Mr',
-        'Mrs Daisy': 'Mrs',
-        'Sam': 'Mr'
-    }
-    name = random.choice(names)
-    title = extract_title(name)
-    assert title == expected_maps.get(name)
 
-
-@pytest.mark.usefixtures('tmp_dump_dir')
-def test_dump_data(tmp_dump_dir, monkeypatch):
-    def mock_path_join(*paths):
-        return tmp_dump_dir
-    monkeypatch.setattr(os.path, 'join', mock_path_join)
-    df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
-    dump_data(df, 'foo')
-    dumped = pd.read_csv(tmp_dump_dir)
-    assert df.equals(dumped)
-
-
-
+def test_massage_data():
+    raw = pd.DataFrame(mock_data)
+    data = massage_data(raw)
+    assert data.iloc[0, 2] == 10
+    assert data.iloc[3, 6] == 7
+
diff --git a/{{ cookiecutter.repo_name }}/tests/test_train_data.py b/{{ cookiecutter.repo_name }}/tests/test_train_data.py
@@ -5,12 +5,12 @@
 
 
 mock_data = {
-   'survived': [1, 0, 0, 1],
-   'name': ['John', 'Bob', 'Sam', 'Kevin'],
-   'ticket': ['foo', 'bar', 'buzz', 'fizz'],
-   'boat': ['y', 'n', 'm', 'y'],
-   'body': ['a', 'b', 'c', 'd'],
-   'home.dest': ['nyc', 'la', 'boston', 'amherst'] 
+   'label': [1, 0, 0, 1],
+   'fizz': ['John', 'Bob', 'Sam', 'Kevin'],
+   'buzz': ['foo', 'bar', 'buzz', 'fizz'],
+   'foo': ['y', 'n', 'm', 'y'],
+   'bar': ['a', 'b', 'c', 'd'],
+   'fish': ['nyc', 'la', 'boston', 'amherst'] 
 }