Skip to content
This repository was archived by the owner on Apr 15, 2022. It is now read-only.

Commit 5d43423

Browse files
Authored by: Epstein
Dbaas 3569 (#43)

* handling artifact directories as zip files
* zip checking in download
* handling spaces
* fixed zip file
* back to original model saving
* file extension checking
* back to original Pipeline storage
* file ext no .
1 parent 9507e76 commit 5d43423

File tree

1 file changed (+33 / −16 lines)

splicemachine/ml/management.py

Lines changed: 33 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
from builtins import super
22
from collections import defaultdict
3-
from os import environ as env_vars, popen as rbash, system as bash
3+
from os import environ as env_vars, popen as rbash, system as bash, path
44
from sys import getsizeof
55
from time import time, sleep
66
from enum import Enum
77
from typing import List, Dict, Tuple
88
import re
9-
109
import requests
10+
from zipfile import ZipFile
11+
from io import BytesIO
1112
from requests.auth import HTTPBasicAuth
1213

1314
import mlflow
@@ -188,7 +189,7 @@ class MLManager(MlflowClient):
188189
A class for managing your MLFlow Runs/Experiments
189190
"""
190191
MLMANAGER_SCHEMA = 'MLMANAGER'
191-
ARTIFACT_INSERT_SQL = f'INSERT INTO {MLMANAGER_SCHEMA}.ARTIFACTS (run_uuid, name, "size", "binary") VALUES (?, ?, ?, ?)'
192+
ARTIFACT_INSERT_SQL = f'INSERT INTO {MLMANAGER_SCHEMA}.ARTIFACTS (run_uuid, name, "size", "binary", file_extension) VALUES (?, ?, ?, ?, ?)'
192193
ARTIFACT_RETRIEVAL_SQL = 'SELECT "binary" FROM ' + f'{MLMANAGER_SCHEMA}.' + 'ARTIFACTS WHERE name=\'{name}\' ' \
193194
'AND run_uuid=\'{runid}\''
194195
MLEAP_INSERT_SQL = f'INSERT INTO {MLMANAGER_SCHEMA}.MODELS(RUN_UUID, MODEL) VALUES (?, ?)'
@@ -471,11 +472,14 @@ def log_artifact(self, file_name, name):
471472
Log an artifact for the active run
472473
:param file_name: (str) the name of the file name to log
473474
:param name: (str) the name of the run relative name to store the model under
475+
NOTE: We do not currently support logging directories. If you would like to log a directory, please zip it first
476+
and log the zip file
474477
"""
478+
file_ext = path.splitext(file_name)[1].lstrip('.')
475479
with open(file_name, 'rb') as artifact:
476-
byte_stream = bytearray(bytes(artifact.read()))
480+
byte_stream = bytearray(bytes(artifact.read()))
477481

478-
self._insert_artifact(name, byte_stream)
482+
self._insert_artifact(name, byte_stream, file_ext=file_ext)
479483

480484
@check_active
481485
def log_artifacts(self, file_names, names):
@@ -513,8 +517,7 @@ def log_spark_model(self, model, name='model'):
513517
oos.writeObject(model._to_java())
514518
oos.flush()
515519
oos.close()
516-
517-
self._insert_artifact(name, baos.toByteArray()) # write the byte stream to the db as a BLOB
520+
self._insert_artifact(name, baos.toByteArray(), file_ext='sparkmodel') # write the byte stream to the db as a BLOB
518521

519522
@staticmethod
520523
def _is_spark_model(spark_object):
@@ -536,11 +539,14 @@ def _is_spark_model(spark_object):
536539

537540
raise Exception("The model supplied does not appear to be a Spark Model!")
538541

539-
def _insert_artifact(self, name, byte_array, mleap_model=False):
542+
def _insert_artifact(self, name, byte_array, mleap_model=False, file_ext=None):
540543
"""
541544
:param name: (str) the path to store the binary
542545
under (with respect to the current run)
543546
:param byte_array: (byte[]) Java byte array
547+
:param mleap_model: (bool) whether or not the artifact is an MLeap model
548+
(We handle mleap models differently, likely to change in future releases)
549+
:param file_ext: (str) the file extension of the model (used for downloading)
544550
"""
545551
db_connection = self.splice_context.getConnection()
546552
file_size = getsizeof(byte_array)
@@ -560,6 +566,7 @@ def _insert_artifact(self, name, byte_array, mleap_model=False):
560566
prepared_statement.setString(2, name)
561567
prepared_statement.setInt(3, file_size)
562568
prepared_statement.setBinaryStream(4, binary_input_stream)
569+
prepared_statement.setString(5,file_ext)
563570

564571
prepared_statement.execute()
565572
prepared_statement.close()
@@ -748,13 +755,22 @@ def download_artifact(self, name, local_path, run_id=None):
748755
:param name: (str) artifact name to load
749756
(with respect to the run)
750757
:param local_path: (str) local path to download the
751-
model to
758+
model to. This path MUST include the file extension
752759
:param run_id: (str) the run id to download the artifact
753760
from. Defaults to active run
754761
"""
762+
file_ext = path.splitext(local_path)[1]
763+
if not file_ext:
764+
raise ValueError('local_path variable must contain the file extension!')
765+
766+
run_id = run_id or self.current_run_id
755767
blob_data = self.retrieve_artifact_stream(run_id, name)
756-
with open(local_path, 'wb') as artifact_file:
757-
artifact_file.write(blob_data)
768+
if file_ext == '.zip':
769+
zip_file = ZipFile(BytesIO(blob_data))
770+
zip_file.extractall()
771+
else:
772+
with open(local_path, 'wb') as artifact_file:
773+
artifact_file.write(blob_data)
758774

759775
def load_spark_model(self, run_id=None, name='model'):
760776
"""
@@ -763,18 +779,19 @@ def load_spark_model(self, run_id=None, name='model'):
763779
:param run_id: the id of the run to get a model from
764780
(the run must have an associated model with it named spark_model)
765781
"""
766-
if not run_id:
767-
run_id = self.current_run_id
768-
else:
769-
run_id = self.get_run(run_id).info.run_uuid
782+
run_id = run_id or self.current_run_id
770783

771784
spark_pipeline_blob = self.retrieve_artifact_stream(run_id, name)
785+
# pipeline_zip = ZipFile(BytesIO(spark_pipeline_blob))
786+
# pipeline_zip.extractall()
787+
# pipeline = PipelineModel.load(name)
772788
bis = self.splice_context.jvm.java.io.ByteArrayInputStream(spark_pipeline_blob)
773789
ois = self.splice_context.jvm.java.io.ObjectInputStream(bis)
774790
pipeline = PipelineModel._from_java(ois.readObject()) # convert object from Java
775-
# PipelineModel to Python PipelineModel
791+
# PipelineModel to Python PipelineModel
776792
ois.close()
777793

794+
778795
if len(pipeline.stages) == 1 and self._is_spark_model(pipeline.stages[0]):
779796
pipeline = pipeline.stages[0]
780797

0 commit comments

Comments (0)