@@ -17,7 +17,6 @@
 import pandas as pd
 import numpy as np
 import boto3
-import time
 import re
 import ast
 import warnings
@@ -39,7 +38,6 @@
 
 from io import BytesIO
 from pyhive import hive
-from urllib.parse import urlparse
 from typing import TypeVar, Optional, Dict, Any
 from confluent_kafka import Consumer, Producer, TopicPartition, KafkaError
 from tqdm.auto import tqdm
@@ -65,7 +63,7 @@
 )
 from hsfs.constructor import query
 from hsfs.training_dataset_split import TrainingDatasetSplit
-from hsfs.client import exceptions, hopsworks
+from hsfs.client import hopsworks
 from hsfs.feature_group import FeatureGroup
 from thrift.transport.TTransport import TTransportException
 from pyhive.exc import OperationalError
@@ -384,11 +382,11 @@ def profile_by_spark(self, metadata_instance):
         job = stat_api.compute(metadata_instance)
         print(
             "Statistics Job started successfully, you can follow the progress at \n{}".format(
-                self.get_job_url(job.href)
+                util.get_job_url(job.href)
             )
         )
 
-        self.wait_for_job(job)
+        job._wait_for_job()
         return job
 
     def profile(
@@ -807,15 +805,13 @@ def write_training_dataset(
         td_job = td_api.compute(training_dataset, td_app_conf)
         print(
             "Training dataset job started successfully, you can follow the progress at \n{}".format(
-                self.get_job_url(td_job.href)
+                util.get_job_url(td_job.href)
             )
         )
 
-        self.wait_for_job(
-            td_job,
-            await_termination=user_write_options.get("wait_for_job", True),
+        td_job._wait_for_job(
+            await_termination=user_write_options.get("wait_for_job", True)
         )
-
         return td_job
 
     def _create_hive_connection(self, feature_store, hive_config=None):
@@ -882,22 +878,6 @@ def save_empty_dataframe(self, feature_group):
         """Wrapper around save_dataframe in order to provide no-op."""
         pass
 
-    def get_job_url(self, href: str):
-        """Use the endpoint returned by the API to construct the UI url for jobs
-
-        Args:
-            href (str): the endpoint returned by the API
-        """
-        url = urlparse(href)
-        url_splits = url.path.split("/")
-        project_id = url_splits[4]
-        job_name = url_splits[6]
-        ui_url = url._replace(
-            path="p/{}/jobs/named/{}/executions".format(project_id, job_name)
-        )
-        ui_url = client.get_instance().replace_public_host(ui_url)
-        return ui_url.geturl()
-
     def _get_app_options(self, user_write_options={}):
         """
         Generate the options that should be passed to the application doing the ingestion.
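Note: the `get_job_url` method removed in the hunk above is now called as `util.get_job_url(...)` in `profile_by_spark` and `write_training_dataset`, so the helper presumably moved to `hsfs.util`. A minimal sketch of that module-level helper, assuming it mirrors the deleted method body and that the `urlparse` and `client` imports move with it:

```python
from urllib.parse import urlparse

from hsfs import client


def get_job_url(href: str):
    """Use the endpoint returned by the API to construct the UI URL for jobs.

    Args:
        href (str): the endpoint returned by the API
    """
    # The API href looks like .../project/<id>/jobs/<name>/...; pick out the
    # project id and job name and rewrite the path to the executions page.
    url = urlparse(href)
    url_splits = url.path.split("/")
    project_id = url_splits[4]
    job_name = url_splits[6]
    ui_url = url._replace(
        path="p/{}/jobs/named/{}/executions".format(project_id, job_name)
    )
    # Swap the internal host for the publicly reachable one.
    ui_url = client.get_instance().replace_public_host(ui_url)
    return ui_url.geturl()
```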
@@ -916,31 +896,6 @@ def _get_app_options(self, user_write_options={}):
             spark_job_configuration=spark_job_configuration,
         )
 
-    def wait_for_job(self, job, await_termination=True):
-        # If the user passed the wait_for_job option consider it,
-        # otherwise use the default True
-        while await_termination:
-            executions = self._job_api.last_execution(job)
-            if len(executions) > 0:
-                execution = executions[0]
-            else:
-                return
-
-            if execution.final_status.lower() == "succeeded":
-                return
-            elif execution.final_status.lower() == "failed":
-                raise exceptions.FeatureStoreException(
-                    "The Hopsworks Job failed, use the Hopsworks UI to access the job logs"
-                )
-            elif execution.final_status.lower() == "killed":
-                raise exceptions.FeatureStoreException("The Hopsworks Job was stopped")
-            elif execution.state.lower() == "framework_failure":
-                raise exceptions.FeatureStoreException(
-                    "The Hopsworks Job monitoring failed, could not determine the final status"
-                )
-
-            time.sleep(3)
-
     def add_file(self, file):
         if not file:
             return file
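Note: the polling loop deleted above is now invoked as `job._wait_for_job()` and `td_job._wait_for_job(await_termination=...)` in the earlier hunks, so the wait logic presumably moved onto the job object itself. A rough sketch of how that method could look if it keeps the removed loop; the class name and the `self._job_api` attribute are assumptions, not shown in this diff:

```python
import time

from hsfs.client import exceptions


class Job:
    # ... constructor and other attributes elided; assumes a _job_api handle ...

    def _wait_for_job(self, await_termination=True):
        # If the caller passed wait_for_job, honour it; otherwise default to True.
        while await_termination:
            executions = self._job_api.last_execution(self)
            if len(executions) > 0:
                execution = executions[0]
            else:
                # No execution yet, nothing to wait on.
                return

            if execution.final_status.lower() == "succeeded":
                return
            elif execution.final_status.lower() == "failed":
                raise exceptions.FeatureStoreException(
                    "The Hopsworks Job failed, use the Hopsworks UI to access the job logs"
                )
            elif execution.final_status.lower() == "killed":
                raise exceptions.FeatureStoreException("The Hopsworks Job was stopped")
            elif execution.state.lower() == "framework_failure":
                raise exceptions.FeatureStoreException(
                    "The Hopsworks Job monitoring failed, could not determine the final status"
                )

            # Poll every few seconds until a terminal state is reached.
            time.sleep(3)
```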
@@ -951,7 +906,9 @@ def add_file(self, file):
 
         local_file = os.path.join("/tmp", os.path.basename(file))
         if not os.path.exists(local_file):
-            content_stream = self._dataset_api.read_content(file, "HIVEDB")
+            content_stream = self._dataset_api.read_content(
+                file, util.get_dataset_type(file)
+            )
             bytesio_object = BytesIO(content_stream.content)
             # Write the stuff
             with open(local_file, "wb") as f:
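Note: the last hunk swaps the hardcoded `"HIVEDB"` dataset type for `util.get_dataset_type(file)`, so `add_file` can read from either the Hive warehouse or a regular project dataset. The helper's implementation is not part of this diff; a plausible sketch, assuming it simply dispatches on the path prefix:

```python
import re


def get_dataset_type(path: str) -> str:
    # Hypothetical helper: paths under the Hive warehouse are read as HIVEDB,
    # anything else as a regular project DATASET.
    if re.match(r"^(?:hdfs://|)/apps/hive/warehouse", path):
        return "HIVEDB"
    return "DATASET"
```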