Skip to content

Commit

Permalink
Clean up pipeline graphing code (#423)
Browse files Browse the repository at this point in the history
* Rename PipelinePlots to PipelineGraph

* Replace class with methods

* Update unit tests

* Update docs

* Lint

* Define 'make_feature_importance_graph'

* Renaming.

* Rename test file

* Changelog

* Simplified path logic

* Lint

* Make graph helper functions not require pipeline

* Fix test

* Fix another test

* Protect against nonexistent paths
  • Loading branch information
dsherry authored Mar 6, 2020
1 parent 080f509 commit 91d62ec
Show file tree
Hide file tree
Showing 8 changed files with 160 additions and 140 deletions.
2 changes: 1 addition & 1 deletion docs/source/automl/search_results.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@
"metadata": {},
"outputs": [],
"source": [
"pipeline.plot.feature_importances()"
"pipeline.feature_importance_graph()"
]
},
{
Expand Down
1 change: 1 addition & 0 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Changelog
* Added Tuner abstract base class :pr:`351`
* Added n_jobs as parameter for AutoClassificationSearch and AutoRegressionSearch :pr:`403`
* Changed colors of confusion matrix to shades of blue and updated axis order to match scikit-learn's :pr:`426`
* Added PipelineBase graph and feature_importance_graph methods, moved from previous location :pr:`423`
* Fixes
* Fixed ROC and confusion matrix plots not being calculated if user passed own additional_objectives :pr:`276`
* Fixed ReadtheDocs FileNotFoundError exception for fraud dataset :pr:`439`
Expand Down
2 changes: 1 addition & 1 deletion docs/source/index.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@
"metadata": {},
"outputs": [],
"source": [
"pipeline.plot()"
"pipeline.graph()"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion docs/source/pipelines/overview.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
"from evalml.pipelines import XGBoostPipeline\n",
"\n",
"xgp = XGBoostPipeline(objective='recall', eta=0.5, min_child_weight=5, max_depth=10, impute_strategy='mean', percent_features=0.5, number_features=10)\n",
"xgp.plot()"
"xgp.graph()"
]
},
{
Expand Down
113 changes: 113 additions & 0 deletions evalml/pipelines/graphs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import os.path

import plotly.graph_objects as go

from evalml.utils.gen_utils import import_or_raise


def make_pipeline_graph(component_list, graph_name, filepath=None):
    """Create a graph of the pipeline, in a format similar to a UML diagram.

    Arguments:
        component_list (list) : Ordered components to draw. Each one must expose a `name` string and a `parameters` dict; consecutive components are connected by an edge.
        graph_name (str) : Name given to the resulting graph.
        filepath (str, optional) : Path to where the graph should be saved. The file extension selects the output format. If set to None (as by default), the graph will not be saved.

    Returns:
        graphviz.Digraph : Graph object that can directly be displayed in Jupyter notebooks.

    Raises:
        RuntimeError : If no working graphviz backend executable is found.
        ValueError : If the parent directory of `filepath` does not exist, or if the
            extension of `filepath` is not an output format supported by graphviz.
    """
    graphviz = import_or_raise('graphviz', error_msg='Please install graphviz to visualize pipelines.')

    # Try rendering a dummy graph to see if a working backend is installed
    try:
        graphviz.Digraph().pipe()
    except graphviz.backend.ExecutableNotFound:
        raise RuntimeError(
            "To graph entity sets, a graphviz backend is required.\n" +
            "Install the backend using one of the following commands:\n" +
            "  Mac OS: brew install graphviz\n" +
            "  Linux (Ubuntu): sudo apt-get install graphviz\n" +
            "  Windows: conda install python-graphviz\n"
        )

    graph_format = None
    path_and_name = None
    if filepath:
        # Explicitly cast to str in case a Path object was passed in
        filepath = str(filepath)
        # Validate the destination without touching the filesystem: probing with
        # open(filepath, 'w') would truncate an existing file and leave a stray
        # empty file behind whenever the format check below fails.
        parent_dir = os.path.dirname(os.path.abspath(filepath))
        if not os.path.isdir(parent_dir):
            raise ValueError('Specified parent directory does not exist: {}'.format(filepath))
        path_and_name, graph_format = os.path.splitext(filepath)
        graph_format = graph_format[1:].lower()  # ignore the dot
        supported_filetypes = graphviz.backend.FORMATS
        if graph_format not in supported_filetypes:
            raise ValueError(("Unknown format '{}'. Make sure your format is one of the " +
                              "following: {}").format(graph_format, supported_filetypes))

    # Initialize a new directed graph
    graph = graphviz.Digraph(name=graph_name, format=graph_format,
                             graph_attr={'splines': 'ortho'})
    graph.attr(rankdir='LR')

    # Draw components. Raw strings keep the literal backslash-l sequence that is
    # handed to graphviz without relying on an invalid Python escape sequence.
    for component in component_list:
        label = r'%s\l' % (component.name)
        if len(component.parameters) > 0:
            # Floats are shown with two decimals; every other value uses str()
            parameters = r'\l'.join([key + ' : ' + "{:0.2f}".format(val) if (isinstance(val, float))
                                     else key + ' : ' + str(val)
                                     for key, val in component.parameters.items()])
            label = r'%s |%s\l' % (component.name, parameters)
        graph.node(component.name, shape='record', label=label)

    # Draw edges between each component and its successor
    for source, target in zip(component_list, component_list[1:]):
        graph.edge(source.name, target.name)

    if filepath:
        graph.render(path_and_name, cleanup=True)

    return graph


def make_feature_importance_graph(feature_importances, show_all_features=False):
    """Create and return a bar graph of the pipeline's feature importances

    Arguments:
        feature_importances (pd.DataFrame) : Feature importances to plot, with a 'feature' column and a numeric 'importance' column. The DataFrame passed in is not modified.
        show_all_features (bool, optional) : If true, graph features with an importance value of zero. Defaults to false.

    Returns:
        plotly.Figure, a bar graph showing features and their importances
    """
    # Build a new DataFrame holding the absolute importances instead of
    # assigning into the argument, so the caller's data keeps its signed values.
    feat_imp = feature_importances.assign(importance=feature_importances['importance'].abs())

    if not show_all_features:
        # Remove features with zero importance
        feat_imp = feat_imp[feat_imp['importance'] != 0]

    # Reverse the row order before plotting
    # NOTE(review): presumably so the first (largest) entry ends up at the top
    # of the horizontal bar chart - confirm against the rendered figure.
    feat_imp = feat_imp.iloc[::-1]

    title = 'Feature Importances'
    subtitle = 'May display fewer features due to feature selection'
    data = [go.Bar(
        x=feat_imp['importance'],
        y=feat_imp['feature'],
        orientation='h'
    )]

    layout = {
        'title': '{0}<br><sub>{1}</sub>'.format(title, subtitle),
        'height': 800,
        'xaxis_title': 'Feature Importance',
        'yaxis_title': 'Feature',
        'yaxis': {
            'type': 'category'
        }
    }

    fig = go.Figure(data=data, layout=layout)
    return fig
29 changes: 23 additions & 6 deletions evalml/pipelines/pipeline_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,13 @@
from sklearn.model_selection import train_test_split

from .components import Estimator, handle_component
from .pipeline_plots import PipelinePlots
from .graphs import make_feature_importance_graph, make_pipeline_graph

from evalml.objectives import get_objective
from evalml.utils import Logger


class PipelineBase:

# Necessary for "Plotting" documentation, since Sphinx does not work well with instance attributes.
plot = PipelinePlots

def __init__(self, objective, component_list, n_jobs, random_state):
"""Machine learning pipeline made out of transformers and a estimator.
Expand Down Expand Up @@ -53,7 +49,6 @@ def __init__(self, objective, component_list, n_jobs, random_state):
for component in self.component_list:
self.parameters.update(component.parameters)

self.plot = PipelinePlots(self)
self.logger = Logger()

def __getitem__(self, index):
Expand Down Expand Up @@ -263,6 +258,17 @@ def score(self, X, y, other_objectives=None):

return scores[0], other_scores

def graph(self, filepath=None):
    """Build the graph of this pipeline's components, optionally saving it to disk.

    Arguments:
        filepath (str, optional) : Destination for the rendered graph. When left as None (the default), nothing is written.

    Returns:
        graphviz.Digraph: Graph object that can be directly displayed in Jupyter notebooks.
    """
    # Delegate to the module-level helper, which only needs the components and a name
    components = self.component_list
    pipeline_name = self.name
    return make_pipeline_graph(components, pipeline_name, filepath=filepath)

@property
def feature_importances(self):
"""Return feature importances. Features dropped by feature selection are excluded"""
Expand All @@ -271,3 +277,14 @@ def feature_importances(self):
importances.sort(key=lambda x: -abs(x[1]))
df = pd.DataFrame(importances, columns=["feature", "importance"])
return df

def feature_importance_graph(self, show_all_features=False):
    """Build a bar graph from this pipeline's `feature_importances`.

    Arguments:
        show_all_features (bool, optional) : When true, features whose importance is zero are graphed too. Defaults to false.

    Returns:
        plotly.Figure, a bar graph showing features and their importances
    """
    # Read the importances once, then hand them to the module-level helper
    importances = self.feature_importances
    figure = make_feature_importance_graph(importances, show_all_features=show_all_features)
    return figure
118 changes: 0 additions & 118 deletions evalml/pipelines/pipeline_plots.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -11,36 +11,43 @@

def test_returns_digraph_object():
clf = PipelineBase('precision', component_list=['Simple Imputer', 'One Hot Encoder', 'Standard Scaler', 'Logistic Regression Classifier'], n_jobs=-1, random_state=0)
plot = clf.plot()
assert isinstance(plot, graphviz.Digraph)
graph = clf.graph()
assert isinstance(graph, graphviz.Digraph)


def test_saving_png_file(tmpdir):
path = os.path.join(str(tmpdir), 'pipeline.png')
filepath = os.path.join(str(tmpdir), 'pipeline.png')
pipeline = PipelineBase('precision', component_list=['Simple Imputer', 'One Hot Encoder', 'Standard Scaler', 'Logistic Regression Classifier'], n_jobs=-1, random_state=0)
pipeline.plot(to_file=path)
assert os.path.isfile(path)
pipeline.graph(filepath=filepath)
assert os.path.isfile(filepath)


def test_missing_file_extension():
path = "test1"
filepath = "test1"
pipeline = PipelineBase('precision', component_list=['Simple Imputer', 'One Hot Encoder', 'Standard Scaler', 'Logistic Regression Classifier'], n_jobs=-1, random_state=0)
with pytest.raises(ValueError, match="Please use a file extension"):
pipeline.plot(to_file=path)
with pytest.raises(ValueError, match="Unknown format"):
pipeline.graph(filepath=filepath)


def test_invalid_format():
path = "test1.xzy"
filepath = "test1.xzy"
pipeline = PipelineBase('precision', component_list=['Simple Imputer', 'One Hot Encoder', 'Standard Scaler', 'Logistic Regression Classifier'], n_jobs=-1, random_state=0)
with pytest.raises(ValueError, match="Unknown format"):
pipeline.plot(to_file=path)
pipeline.graph(filepath=filepath)


def test_invalid_path(tmpdir):
    """Graphing to a path under directories this test never creates must raise ValueError."""
    target = os.path.join(str(tmpdir), 'invalid', 'path', 'pipeline.png')
    components = ['Simple Imputer', 'One Hot Encoder', 'Standard Scaler', 'Logistic Regression Classifier']
    clf = PipelineBase('precision', component_list=components, n_jobs=-1, random_state=0)
    expected_message = "Specified parent directory does not exist"
    with pytest.raises(ValueError, match=expected_message):
        clf.graph(filepath=target)


def test_feature_importance_plot(X_y):
X, y = X_y
clf = PipelineBase('precision', component_list=['Simple Imputer', 'One Hot Encoder', 'Standard Scaler', 'Logistic Regression Classifier'], n_jobs=-1, random_state=0)
clf.fit(X, y)
assert isinstance(clf.plot.feature_importances(), go.Figure)
assert isinstance(clf.feature_importance_graph(), go.Figure)


def test_feature_importance_plot_show_all_features(X_y):
Expand All @@ -66,12 +73,12 @@ def feature_importances(self):
X, y = X_y
clf = MockPipeline()
clf.fit(X, y)
figure = clf.plot.feature_importances()
figure = clf.feature_importance_graph()
assert isinstance(figure, go.Figure)

data = figure.data[0]
assert (np.all(data['x']))

figure = clf.plot.feature_importances(show_all_features=True)
figure = clf.feature_importance_graph(show_all_features=True)
data = figure.data[0]
assert (np.any(data['x'] == 0.0))

0 comments on commit 91d62ec

Please sign in to comment.