Clean up pipeline graphing code (#423)

* Rename PipelinePlots to PipelineGraph * Replace class with methods * Update unit tests * Update docs * Lint * Define 'make_feature_importance_graph' * Renaming. * Rename test file * Changelog * Simplified path logic * Lint * Make graph helper functions not require pipeline * Fix test * Fix another test * Protect against nonexistent paths
alteryx · Mar 6, 2020 · 91d62ec · 91d62ec
1 parent 080f509
commit 91d62ec
Show file tree

Hide file tree

Showing 8 changed files with 160 additions and 140 deletions.
diff --git a/docs/source/automl/search_results.ipynb b/docs/source/automl/search_results.ipynb
@@ -126,7 +126,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "pipeline.plot.feature_importances()"
+    "pipeline.feature_importance_graph()"
    ]
   },
   {

diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
@@ -9,6 +9,7 @@ Changelog
         * Added Tuner abstract base class :pr:`351`
         * Added n_jobs as parameter for AutoClassificationSearch and AutoRegressionSearch :pr:`403`
         * Changed colors of confusion matrix to shades of blue and updated axis order to match scikit-learn's :pr:`426`
+        * Added PipelineBase graph and feature_importance_graph methods, moved from previous location :pr:`423`
     * Fixes
         * Fixed ROC and confusion matrix plots not being calculated if user passed own additional_objectives :pr:`276`
         * Fixed ReadtheDocs FileNotFoundError exception for fraud dataset :pr:`439`

diff --git a/docs/source/index.ipynb b/docs/source/index.ipynb
@@ -194,7 +194,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "pipeline.plot()"
+    "pipeline.graph()"
    ]
   },
   {

diff --git a/docs/source/pipelines/overview.ipynb b/docs/source/pipelines/overview.ipynb
@@ -32,7 +32,7 @@
     "from evalml.pipelines import XGBoostPipeline\n",
     "\n",
     "xgp = XGBoostPipeline(objective='recall', eta=0.5, min_child_weight=5, max_depth=10, impute_strategy='mean', percent_features=0.5, number_features=10)\n",
-    "xgp.plot()"
+    "xgp.graph()"
    ]
   },
   {

diff --git a/evalml/pipelines/graphs.py b/evalml/pipelines/graphs.py
@@ -0,0 +1,113 @@
+import os.path
+
+import plotly.graph_objects as go
+
+from evalml.utils.gen_utils import import_or_raise
+
+
+def make_pipeline_graph(component_list, graph_name, filepath=None):
+    """Create a graph of the pipeline, in a format similar to a UML diagram.
+
+    Arguments:
+        component_list (list) : Ordered pipeline components to draw. Each component must expose a ``name`` and a ``parameters`` dict.
+        graph_name (str) : Name given to the graphviz graph.
+        filepath (str, optional) : Path to where the graph should be saved. The file extension selects the output format. If set to None (as by default), the graph will not be saved.
+
+    Returns:
+        graphviz.Digraph : Graph object that can directly be displayed in Jupyter notebooks.
+
+    Raises:
+        RuntimeError : If no working graphviz backend is installed.
+        ValueError : If the extension of filepath is not a format supported by graphviz, or if filepath cannot be opened for writing.
+    """
+    graphviz = import_or_raise('graphviz', error_msg='Please install graphviz to visualize pipelines.')
+
+    # Try rendering a dummy graph to see if a working backend is installed
+    try:
+        graphviz.Digraph().pipe()
+    except graphviz.backend.ExecutableNotFound:
+        raise RuntimeError(
+            "To graph entity sets, a graphviz backend is required.\n" +
+            "Install the backend using one of the following commands:\n" +
+            "  Mac OS: brew install graphviz\n" +
+            "  Linux (Ubuntu): sudo apt-get install graphviz\n" +
+            "  Windows: conda install python-graphviz\n"
+        )
+
+    graph_format = None
+    path_and_name = None
+    if filepath:
+        # Explicitly cast to str in case a Path object was passed in
+        filepath = str(filepath)
+        path_and_name, extension = os.path.splitext(filepath)
+        graph_format = extension[1:].lower()  # ignore the dot
+        supported_filetypes = graphviz.backend.FORMATS
+        # Validate the format before touching the filesystem, so that a
+        # rejected extension does not leave an empty file behind.
+        if graph_format not in supported_filetypes:
+            raise ValueError(("Unknown format '{}'. Make sure your format is one of the " +
+                              "following: {}").format(graph_format, supported_filetypes))
+        # Probe that the destination can be written to. The context manager
+        # guarantees the handle is closed even if an error occurs.
+        try:
+            with open(filepath, 'w'):
+                pass
+        except IOError:
+            raise ValueError(('Specified parent directory does not exist: {}'.format(filepath)))
+
+    # Initialize a new directed graph
+    graph = graphviz.Digraph(name=graph_name, format=graph_format,
+                             graph_attr={'splines': 'ortho'})
+    graph.attr(rankdir='LR')
+
+    # Draw components. The two-character sequence backslash + 'l' is passed
+    # through to graphviz as a line terminator inside the record label; it is
+    # written as '\\l' because a bare '\l' is not a valid Python string escape.
+    for component in component_list:
+        label = '{}\\l'.format(component.name)
+        if component.parameters:
+            lines = []
+            for key, val in component.parameters.items():
+                # Floats are shortened to two decimals to keep the node compact
+                shown = "{:0.2f}".format(val) if isinstance(val, float) else str(val)
+                lines.append(key + ' : ' + shown)
+            label = '{} |{}\\l'.format(component.name, '\\l'.join(lines))
+        graph.node(component.name, shape='record', label=label)
+
+    # Draw edges between each component and the one that follows it
+    for source, target in zip(component_list, component_list[1:]):
+        graph.edge(source.name, target.name)
+
+    if filepath:
+        graph.render(path_and_name, cleanup=True)
+
+    return graph
+
+
+def make_feature_importance_graph(feature_importances, show_all_features=False):
+    """Create and return a bar graph of the pipeline's feature importances
+
+    Arguments:
+        feature_importances (pd.DataFrame) : Feature importances to graph, with a 'feature' column and an 'importance' column. The dataframe passed in is not modified.
+        show_all_features (bool, optional) : If true, graph features with an importance value of zero. Defaults to false.
+
+    Returns:
+        plotly.Figure, a bar graph showing features and their importances
+    """
+    # Build a new dataframe rather than assigning into the argument: taking
+    # absolute values in place would overwrite the signed importances held by
+    # the caller.
+    feat_imp = feature_importances.assign(importance=feature_importances['importance'].abs())
+
+    if not show_all_features:
+        # Remove features with zero importance
+        feat_imp = feat_imp[feat_imp['importance'] != 0]
+
+    # Reverse the row order before plotting
+    feat_imp = feat_imp.iloc[::-1]
+
+    title = 'Feature Importances'
+    subtitle = 'May display fewer features due to feature selection'
+    data = [go.Bar(
+        x=feat_imp['importance'],
+        y=feat_imp['feature'],
+        orientation='h'
+    )]
+
+    layout = {
+        'title': '{0}<br><sub>{1}</sub>'.format(title, subtitle),
+        'height': 800,
+        'xaxis_title': 'Feature Importance',
+        'yaxis_title': 'Feature',
+        'yaxis': {
+            'type': 'category'
+        }
+    }
+
+    fig = go.Figure(data=data, layout=layout)
+    return fig
diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py
@@ -4,17 +4,13 @@
 from sklearn.model_selection import train_test_split
 
 from .components import Estimator, handle_component
-from .pipeline_plots import PipelinePlots
+from .graphs import make_feature_importance_graph, make_pipeline_graph
 
 from evalml.objectives import get_objective
 from evalml.utils import Logger
 
 
 class PipelineBase:
-
-    # Necessary for "Plotting" documentation, since Sphinx does not work well with instance attributes.
-    plot = PipelinePlots
-
     def __init__(self, objective, component_list, n_jobs, random_state):
         """Machine learning pipeline made out of transformers and a estimator.
 
@@ -53,7 +49,6 @@ def __init__(self, objective, component_list, n_jobs, random_state):
         for component in self.component_list:
             self.parameters.update(component.parameters)
 
-        self.plot = PipelinePlots(self)
         self.logger = Logger()
 
     def __getitem__(self, index):
@@ -263,6 +258,17 @@ def score(self, X, y, other_objectives=None):
 
         return scores[0], other_scores
 
+    def graph(self, filepath=None):
+        """Build a graph of this pipeline's components, optionally saving it to disk
+
+        Arguments:
+            filepath (str, optional) : Destination the graph should be saved to. When left as None (the default), nothing is saved.
+
+        Returns:
+            graphviz.Digraph: Graph object that can be directly displayed in Jupyter notebooks.
+        """
+        components, name = self.component_list, self.name
+        return make_pipeline_graph(components, name, filepath=filepath)
+
     @property
     def feature_importances(self):
         """Return feature importances. Features dropped by feature selection are excluded"""
@@ -271,3 +277,14 @@ def feature_importances(self):
         importances.sort(key=lambda x: -abs(x[1]))
         df = pd.DataFrame(importances, columns=["feature", "importance"])
         return df
+
+    def feature_importance_graph(self, show_all_features=False):
+        """Build a bar graph from this pipeline's feature importances
+
+        Arguments:
+            show_all_features (bool, optional) : If true, features with an importance value of zero are graphed as well. Defaults to false.
+
+        Returns:
+            plotly.Figure, a bar graph showing features and their importances
+        """
+        importances = self.feature_importances
+        return make_feature_importance_graph(importances, show_all_features=show_all_features)
diff --git a/evalml/pipelines/pipeline_plots.py b/evalml/pipelines/pipeline_plots.py
diff --git a/...ts/pipeline_tests/test_pipelines_plots.py → evalml/tests/pipeline_tests/test_graphs.py b/...ts/pipeline_tests/test_pipelines_plots.py → evalml/tests/pipeline_tests/test_graphs.py
@@ -11,36 +11,43 @@
 
 def test_returns_digraph_object():
     clf = PipelineBase('precision', component_list=['Simple Imputer', 'One Hot Encoder', 'Standard Scaler', 'Logistic Regression Classifier'], n_jobs=-1, random_state=0)
-    plot = clf.plot()
-    assert isinstance(plot, graphviz.Digraph)
+    graph = clf.graph()
+    assert isinstance(graph, graphviz.Digraph)
 
 
 def test_saving_png_file(tmpdir):
-    path = os.path.join(str(tmpdir), 'pipeline.png')
+    filepath = os.path.join(str(tmpdir), 'pipeline.png')
     pipeline = PipelineBase('precision', component_list=['Simple Imputer', 'One Hot Encoder', 'Standard Scaler', 'Logistic Regression Classifier'], n_jobs=-1, random_state=0)
-    pipeline.plot(to_file=path)
-    assert os.path.isfile(path)
+    pipeline.graph(filepath=filepath)
+    assert os.path.isfile(filepath)
 
 
 def test_missing_file_extension():
-    path = "test1"
+    filepath = "test1"
     pipeline = PipelineBase('precision', component_list=['Simple Imputer', 'One Hot Encoder', 'Standard Scaler', 'Logistic Regression Classifier'], n_jobs=-1, random_state=0)
-    with pytest.raises(ValueError, match="Please use a file extension"):
-        pipeline.plot(to_file=path)
+    with pytest.raises(ValueError, match="Unknown format"):
+        pipeline.graph(filepath=filepath)
 
 
 def test_invalid_format():
-    path = "test1.xzy"
+    filepath = "test1.xzy"
     pipeline = PipelineBase('precision', component_list=['Simple Imputer', 'One Hot Encoder', 'Standard Scaler', 'Logistic Regression Classifier'], n_jobs=-1, random_state=0)
     with pytest.raises(ValueError, match="Unknown format"):
-        pipeline.plot(to_file=path)
+        pipeline.graph(filepath=filepath)
+
+
+def test_invalid_path(tmpdir):
+    """Saving a graph under a directory that does not exist should raise a ValueError."""
+    components = ['Simple Imputer', 'One Hot Encoder', 'Standard Scaler', 'Logistic Regression Classifier']
+    pipeline = PipelineBase('precision', component_list=components, n_jobs=-1, random_state=0)
+    # 'invalid/path' is never created under tmpdir, so the target's parent directory is missing
+    missing_dir_target = os.path.join(str(tmpdir), 'invalid', 'path', 'pipeline.png')
+    with pytest.raises(ValueError, match="Specified parent directory does not exist"):
+        pipeline.graph(filepath=missing_dir_target)
 
 
 def test_feature_importance_plot(X_y):
     X, y = X_y
     clf = PipelineBase('precision', component_list=['Simple Imputer', 'One Hot Encoder', 'Standard Scaler', 'Logistic Regression Classifier'], n_jobs=-1, random_state=0)
     clf.fit(X, y)
-    assert isinstance(clf.plot.feature_importances(), go.Figure)
+    assert isinstance(clf.feature_importance_graph(), go.Figure)
 
 
 def test_feature_importance_plot_show_all_features(X_y):
@@ -66,12 +73,12 @@ def feature_importances(self):
     X, y = X_y
     clf = MockPipeline()
     clf.fit(X, y)
-    figure = clf.plot.feature_importances()
+    figure = clf.feature_importance_graph()
     assert isinstance(figure, go.Figure)
 
     data = figure.data[0]
     assert (np.all(data['x']))
 
-    figure = clf.plot.feature_importances(show_all_features=True)
+    figure = clf.feature_importance_graph(show_all_features=True)
     data = figure.data[0]
     assert (np.any(data['x'] == 0.0))