
Commit 69a4f0d
Project import generated by Copybara. (#18)
GitOrigin-RevId: 288c0c4da10ce230b81b6eb80316011cbb76252b

Co-authored-by: Snowflake Authors <[email protected]>
snowflake-provisioner and Snowflake Authors authored Jun 2, 2023
1 parent 86d00b9 commit 69a4f0d
Showing 183 changed files with 5,601 additions and 891 deletions.
2 changes: 1 addition & 1 deletion ci/conda_recipe/meta.yaml
@@ -25,7 +25,7 @@ requirements:
- cloudpickle
- fsspec>=2022.11,<=2023.1
- numpy>=1.23,<2
- packaging>=23.0,<24
- packaging>=20.9,<24
- pandas>=1.0.0,<2 # Limit since 2.x is not available in Snowflake Anaconda Channel yet.
- pyyaml>=6.0,<7
- scikit-learn>=1.2.1,<2
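The only change in this recipe relaxes the lower bound on packaging from 23.0 to 20.9. As a quick sanity check of what the widened range admits, here is a small sketch using packaging's own SpecifierSet API (the probe versions are illustrative):

```python
from packaging.specifiers import SpecifierSet
from packaging.version import Version

old_spec = SpecifierSet(">=23.0,<24")  # pin before this commit
new_spec = SpecifierSet(">=20.9,<24")  # relaxed pin after this commit

for v in ("20.9", "21.3", "23.0", "24.0"):
    # Version membership checks against each specifier set.
    print(v, Version(v) in old_spec, Version(v) in new_spec)
# 20.9 and 21.3 are newly admitted; 24.0 remains excluded by both.
```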
32 changes: 30 additions & 2 deletions ci/type_ignored_targets
@@ -1,4 +1,32 @@
//snowflake/ml/experimental/...
//snowflake/ml/modeling/...
//tests/integ/snowflake/ml/_internal/...
//tests/integ/snowflake/ml/extra_tests/...
//tests/integ/snowflake/ml/preprocessing/...
//tests/integ/snowflake/ml/sklearn/preprocessing/...

//snowflake/ml/sklearn/linear_model/...
//snowflake/ml/sklearn/ensemble/...
//snowflake/ml/sklearn/svm/...
//snowflake/ml/sklearn/neural_network/...
//snowflake/ml/sklearn/tree/...
//snowflake/ml/sklearn/calibration/...
//snowflake/ml/sklearn/cluster/...
//snowflake/ml/sklearn/compose/...
//snowflake/ml/sklearn/covariance/...
//snowflake/ml/sklearn/decomposition/...
//snowflake/ml/sklearn/discriminant_analysis/...
//snowflake/ml/sklearn/feature_selection/...
//snowflake/ml/sklearn/gaussian_process/...
//snowflake/ml/sklearn/impute/...
//snowflake/ml/sklearn/isotonic/...
//snowflake/ml/sklearn/kernel_approximation/...
//snowflake/ml/sklearn/kernel_ridge/...
//snowflake/ml/sklearn/manifold/...
//snowflake/ml/sklearn/mixture/...
//snowflake/ml/sklearn/model_selection/...
//snowflake/ml/sklearn/multiclass/...
//snowflake/ml/sklearn/multioutput/...
//snowflake/ml/sklearn/naive_bayes/...
//snowflake/ml/sklearn/neighbors/...
//snowflake/ml/sklearn/semi_supervised/...
//snowflake/ml/xgboost/...
//snowflake/ml/lightgbm/...
3 changes: 2 additions & 1 deletion codegen/codegen_rules.bzl
@@ -82,12 +82,13 @@ def autogen_estimators(module, estimator_info_list):
srcs = [":generate_{}".format(e.normalized_class_name)],
deps = [
":init",
"//snowflake/ml/framework:framework",
"//snowflake/ml/sklearn/framework:framework",
"//snowflake/ml/_internal:telemetry",
"//snowflake/ml/_internal/utils:temp_file_utils",
"//snowflake/ml/_internal/utils:query_result_checker",
"//snowflake/ml/_internal/utils:pkg_version_utils",
"//snowflake/ml/_internal/utils:identifier",
"//snowflake/ml/model:model_signature",
],
tags = ["skip_mypy_check"],
)
4 changes: 2 additions & 2 deletions codegen/sklearn_wrapper_generator.py
@@ -311,9 +311,9 @@ def get_snow_ml_module_name(module_name: str) -> str:
"""
tokens = module_name.split(".")
if tokens[0] == "sklearn":
return "snowflake.ml.modeling." + ".".join(module_name.split(".")[1:])
return "snowflake.ml.sklearn." + ".".join(module_name.split(".")[1:])
else:
return "snowflake.ml.modeling." + module_name
return "snowflake.ml." + module_name

@staticmethod
def can_generate_wrapper(class_object: Tuple[str, type]) -> bool:
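For clarity, here is a minimal standalone sketch of the post-commit mapping (the example inputs are assumptions; only the function body comes from the diff above):

```python
def get_snow_ml_module_name(module_name: str) -> str:
    # Map an OSS module path into the snowflake.ml namespace (post-commit layout).
    tokens = module_name.split(".")
    if tokens[0] == "sklearn":
        return "snowflake.ml.sklearn." + ".".join(tokens[1:])
    return "snowflake.ml." + module_name

print(get_snow_ml_module_name("sklearn.linear_model"))  # snowflake.ml.sklearn.linear_model
print(get_snow_ml_module_name("xgboost"))               # snowflake.ml.xgboost
```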
91 changes: 81 additions & 10 deletions codegen/sklearn_wrapper_template.py_template
@@ -12,7 +12,7 @@ import numpy as np
{transform.estimator_imports}
from sklearn.utils.metaestimators import available_if

from snowflake.ml.framework.base import BaseTransformer
from snowflake.ml.sklearn.framework.base import BaseTransformer
from snowflake.ml._internal import telemetry
from snowflake.ml._internal.utils.query_result_checker import SqlResultValidator
from snowflake.ml._internal.utils import pkg_version_utils, identifier
@@ -21,6 +21,14 @@ from snowflake.snowpark import DataFrame, Session
from snowflake.snowpark.functions import pandas_udf, sproc
from snowflake.snowpark.types import PandasSeries

from snowflake.ml.model.model_signature import (
DataType,
FeatureSpec,
ModelSignature,
_infer_signature,
_rename_features,
)

_PROJECT = "ModelDevelopment"
# Derive subproject from module name by removing "sklearn"
# and converting module name from underscore to CamelCase
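The comment above describes how the telemetry subproject is derived; a hedged sketch of that transformation (the helper name _derive_subproject is hypothetical — the template's actual code for this sits outside the hunk):

```python
def _derive_subproject(module_name: str) -> str:  # hypothetical helper name
    # Drop the "sklearn" token, then convert snake_case segments to CamelCase.
    parts = [t for t in module_name.split(".") if t != "sklearn"]
    return "".join(word.capitalize() for part in parts for word in part.split("_"))

print(_derive_subproject("sklearn.linear_model"))  # LinearModel
```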
@@ -116,6 +124,7 @@ class {transform.original_class_name}(BaseTransformer):
self._sklearn_object = {transform.root_module_name}.{transform.original_class_name}(
{transform.sklearn_init_arguments}
)
self._model_signature_dict = None
{transform.estimator_init_member_args}

def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
@@ -161,6 +170,7 @@ class {transform.original_class_name}(BaseTransformer):
"Supported dataset types: snowpark.DataFrame, pandas.DataFrame."
)
self._is_fitted = True
self._get_model_signatures(dataset)
return self

def _fit_snowpark(self, dataset: DataFrame) -> None:
@@ -310,9 +320,9 @@ class {transform.original_class_name}(BaseTransformer):
query,
stage_transform_file_name,
stage_result_file_name,
identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.input_cols),
identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.label_cols),
identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.sample_weight_col),
identifier.get_unescaped_names(self.input_cols),
identifier.get_unescaped_names(self.label_cols),
identifier.get_unescaped_names(self.sample_weight_col),
statement_params=statement_params,
)

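This hunk (and the matching ones below) replace identifier.get_equivalent_identifier_in_the_response_pandas_dataframe with the shorter identifier.get_unescaped_names. Its implementation is not part of this diff; the following is a plausible sketch, assuming it merely strips the double quotes Snowflake uses for escaped identifiers:

```python
from typing import List, Optional, Union

def get_unescaped_names(ids: Optional[Union[str, List[str]]]) -> Optional[Union[str, List[str]]]:
    # Assumed behavior: '"My Col"' -> 'My Col'; unquoted names pass through unchanged.
    def _unescape(name: str) -> str:
        if len(name) >= 2 and name.startswith('"') and name.endswith('"'):
            return name[1:-1].replace('""', '"')  # collapse doubled quotes
        return name

    if ids is None:
        return None
    if isinstance(ids, str):
        return _unescape(ids)
    return [_unescape(i) for i in ids]
```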
@@ -378,7 +388,7 @@ class {transform.original_class_name}(BaseTransformer):
# Input columns for the UDF are sorted by column name.
# We need the actual order of the input cols to reorder the dataframe before calling inference methods.
input_cols = self.input_cols
unquoted_input_cols = identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.input_cols)
unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)

statement_params = telemetry.get_function_usage_statement_params(
project=_PROJECT,
@@ -511,9 +521,37 @@ class {transform.original_class_name}(BaseTransformer):
expected_output_cols_list: List[str]
) -> pd.DataFrame:
output_cols = expected_output_cols_list.copy()
transformed_numpy_array = getattr(self._sklearn_object, inference_method)(
dataset[self.input_cols]

# The model expects exactly the same column names in the input df at predict time.
# If the user fit with a Snowpark DataFrame but passes a pandas DataFrame to predict,
# the input cols must be matched in both their quoted and unquoted forms.
input_cols = self.input_cols
unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)

estimator = self._sklearn_object

input_df = dataset[input_cols] # Select input columns with quoted column names.
if hasattr(estimator, "feature_names_in_"):
missing_features = []
for i, f in enumerate(getattr(estimator, "feature_names_in_")):
if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
missing_features.append(f)

if len(missing_features) > 0:
raise ValueError(
"The feature names should match those that were passed during fit.\n"
f"Features seen during the fit call but not present in the input: {{missing_features}}\n"
f"Features in the input dataframe: {{input_cols}}\n"
)
input_df.columns = getattr(estimator, "feature_names_in_")
else:
# Just rename the columns to unquoted identifiers.
input_df.columns = unquoted_input_cols # Replace the quoted column identifiers with unquoted column names.

transformed_numpy_array = getattr(estimator, inference_method)(
input_df
)

if (
isinstance(transformed_numpy_array, list)
and len(transformed_numpy_array) > 0
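The block above mirrors scikit-learn's own feature-name validation: an estimator fitted on a pandas DataFrame records feature_names_in_ and complains when predict-time column names differ. An illustrative reproduction (not from the commit) of the mismatch the rename step avoids:

```python
import pandas as pd
from sklearn.linear_model import LinearRegression

train = pd.DataFrame({"A": [1.0, 2.0, 3.0], "B": [2.0, 4.0, 6.0]})
model = LinearRegression().fit(train, [1.0, 2.0, 3.0])
print(model.feature_names_in_)  # ['A' 'B']

# A frame converted back from Snowflake may carry quoted names such as '"A"'.
quoted = train.rename(columns={"A": '"A"', "B": '"B"'})
quoted.columns = model.feature_names_in_  # the template's rename step
print(model.predict(quoted))  # column names match training time again
```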
@@ -974,12 +1012,45 @@ class {transform.original_class_name}(BaseTransformer):
score_sproc_name,
query,
stage_score_file_name,
identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.input_cols),
identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.label_cols),
identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.sample_weight_col),
identifier.get_unescaped_names(self.input_cols),
identifier.get_unescaped_names(self.label_cols),
identifier.get_unescaped_names(self.sample_weight_col),
statement_params=statement_params,
)

cleanup_temp_files([local_score_file_name])

return score

def _get_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
self._model_signature_dict: Dict[str, ModelSignature] = dict()

PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

inputs = _infer_signature(dataset[self.input_cols], "input")
if hasattr(self, "predict"):
# For classifiers, the predict output has the same type as the labels.
if self._sklearn_object._estimator_type == 'classifier':
outputs = _infer_signature(dataset[self.label_cols], "output") # label columns give the desired output type
outputs = _rename_features(outputs, self.output_cols) # rename to the output column names
self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
# For regressors, predict returns float64.
elif self._sklearn_object._estimator_type == 'regressor':
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)

for prob_func in PROB_FUNCTIONS:
if hasattr(self, prob_func):
output_cols_prefix: str = f"{{prob_func}}_"
output_column_names = self._get_output_column_names(output_cols_prefix)
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)

# TODO: Add support for the transform method.


@property
def model_signatures(self) -> Dict[str, ModelSignature]:
if self._model_signature_dict is None:
raise RuntimeError("Estimator not fitted before accessing property model_signatures!")
return self._model_signature_dict
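A hedged usage sketch of the new model_signatures property (the import path and the input_cols/label_cols/output_cols constructor arguments follow this commit's layout for generated estimators; the data is illustrative):

```python
import pandas as pd
from snowflake.ml.sklearn.linear_model import LinearRegression

df = pd.DataFrame({"X1": [1.0, 2.0, 3.0], "X2": [0.5, 1.0, 1.5], "Y": [2.0, 4.0, 6.0]})
reg = LinearRegression(input_cols=["X1", "X2"], label_cols=["Y"], output_cols=["Y_PRED"])
reg.fit(df)  # fit() now also populates the signatures via _get_model_signatures()

print(reg.model_signatures["predict"])  # ModelSignature with a DOUBLE output "Y_PRED"
# Accessing the property before fit() raises RuntimeError, per the code above.
```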
1 change: 0 additions & 1 deletion conda-env-extended.yml
@@ -8,7 +8,6 @@
channels:
- conda-forge
dependencies:
- moto==4.0.11 # SNOW-690705
- torchdata==0.4.1 # SNOW-702102
# SNOW-747683: Tensorflow is available on snowflake conda channel,
# however, macos-arm64 is only available on conda-forge.
4 changes: 3 additions & 1 deletion conda-env-snowflake.yml
@@ -17,12 +17,14 @@ dependencies:
- boto3==1.24.28
- conda-libmamba-solver==23.1.0
- coverage==6.3.2 # not a package dependency.
- docker-py==4.4.1
- flask-cors==3.0.10
- flask==2.1.3
- fsspec==2022.10.0
- inflection==0.5.1
- joblib==1.1.1
- lightgbm==3.3.5
- moto==4.0.11
- networkx==2.8.4
- numpy==1.23.4
- packaging==23.0
@@ -38,4 +40,4 @@ dependencies:
- sqlparse==0.4.3
- typing-extensions==4.5.0
- xgboost==1.7.3
- mypy==0.981 # not a package dependency.
- mypy==0.981 # not a package dependency.
1 change: 1 addition & 0 deletions conda-env.yml
@@ -12,6 +12,7 @@ dependencies:
- boto3==1.24.28
- conda-libmamba-solver==23.1.0
- coverage==6.3.2
- docker-py==4.4.1
- flask-cors==3.0.10
- flask==2.1.3
- fsspec==2022.10.0
58 changes: 29 additions & 29 deletions snowflake/ml/BUILD.bazel
@@ -41,7 +41,7 @@ snowml_wheel(
"cloudpickle", # Version range is specified by snowpark. We are implicitly depending on it.
"fsspec[http]>=2022.11,<=2023.1",
"numpy>=1.23,<2",
"packaging>=23.0,<24",
"packaging>=20.9,<24",
"pandas>=1.0.0,<2", # Limit since 2.x is not available in Snowflake Anaconda Channel yet.
"pyyaml>=6.0,<7",
"scikit-learn>=1.2.1,<2",
@@ -55,37 +55,37 @@ snowml_wheel(
version = VERSION,
deps = [
"//snowflake/ml/metrics:metrics_pkg",
"//snowflake/ml/preprocessing:preprocessing_pkg",
"//snowflake/ml/sklearn/preprocessing:preprocessing_pkg",
"//snowflake/ml/utils:utils_pkg",
"//snowflake/ml/fileset:fileset_pkg",
"//snowflake/ml/registry:model_registry_pkg",
# Autogen packages
"//snowflake/ml/modeling/linear_model:sklearn_linear_model_pkg",
"//snowflake/ml/modeling/ensemble:sklearn_ensemble_pkg",
"//snowflake/ml/modeling/svm:sklearn_svm_pkg",
"//snowflake/ml/modeling/neural_network:sklearn_neural_network_pkg",
"//snowflake/ml/modeling/tree:sklearn_tree_pkg",
"//snowflake/ml/modeling/xgboost:xgboost_pkg",
"//snowflake/ml/modeling/calibration:sklearn_calibration_pkg",
"//snowflake/ml/modeling/cluster:sklearn_cluster_pkg",
"//snowflake/ml/modeling/compose:sklearn_compose_pkg",
"//snowflake/ml/modeling/covariance:sklearn_covariance_pkg",
"//snowflake/ml/modeling/decomposition:sklearn_decomposition_pkg",
"//snowflake/ml/modeling/discriminant_analysis:sklearn_discriminant_analysis_pkg",
"//snowflake/ml/modeling/feature_selection:sklearn_feature_selection_pkg",
"//snowflake/ml/modeling/gaussian_process:sklearn_gaussian_process_pkg",
"//snowflake/ml/modeling/impute:sklearn_impute_pkg",
"//snowflake/ml/modeling/isotonic:sklearn_isotonic_pkg",
"//snowflake/ml/modeling/kernel_approximation:sklearn_kernel_approximation_pkg",
"//snowflake/ml/modeling/kernel_ridge:sklearn_kernel_ridge_pkg",
"//snowflake/ml/modeling/manifold:sklearn_manifold_pkg",
"//snowflake/ml/modeling/mixture:sklearn_mixture_pkg",
"//snowflake/ml/modeling/model_selection:sklearn_model_selection_pkg",
"//snowflake/ml/modeling/multiclass:sklearn_multiclass_pkg",
"//snowflake/ml/modeling/multioutput:sklearn_multioutput_pkg",
"//snowflake/ml/modeling/naive_bayes:sklearn_naive_bayes_pkg",
"//snowflake/ml/modeling/neighbors:sklearn_neighbors_pkg",
"//snowflake/ml/modeling/semi_supervised:sklearn_semi_supervised_pkg",
"//snowflake/ml/modeling/lightgbm:lightgbm_pkg",
"//snowflake/ml/sklearn/linear_model:sklearn_linear_model_pkg",
"//snowflake/ml/sklearn/ensemble:sklearn_ensemble_pkg",
"//snowflake/ml/sklearn/svm:sklearn_svm_pkg",
"//snowflake/ml/sklearn/neural_network:sklearn_neural_network_pkg",
"//snowflake/ml/sklearn/tree:sklearn_tree_pkg",
"//snowflake/ml/sklearn/calibration:sklearn_calibration_pkg",
"//snowflake/ml/sklearn/cluster:sklearn_cluster_pkg",
"//snowflake/ml/sklearn/compose:sklearn_compose_pkg",
"//snowflake/ml/sklearn/covariance:sklearn_covariance_pkg",
"//snowflake/ml/sklearn/decomposition:sklearn_decomposition_pkg",
"//snowflake/ml/sklearn/discriminant_analysis:sklearn_discriminant_analysis_pkg",
"//snowflake/ml/sklearn/feature_selection:sklearn_feature_selection_pkg",
"//snowflake/ml/sklearn/gaussian_process:sklearn_gaussian_process_pkg",
"//snowflake/ml/sklearn/impute:sklearn_impute_pkg",
"//snowflake/ml/sklearn/isotonic:sklearn_isotonic_pkg",
"//snowflake/ml/sklearn/kernel_approximation:sklearn_kernel_approximation_pkg",
"//snowflake/ml/sklearn/kernel_ridge:sklearn_kernel_ridge_pkg",
"//snowflake/ml/sklearn/manifold:sklearn_manifold_pkg",
"//snowflake/ml/sklearn/mixture:sklearn_mixture_pkg",
"//snowflake/ml/sklearn/model_selection:sklearn_model_selection_pkg",
"//snowflake/ml/sklearn/multiclass:sklearn_multiclass_pkg",
"//snowflake/ml/sklearn/multioutput:sklearn_multioutput_pkg",
"//snowflake/ml/sklearn/naive_bayes:sklearn_naive_bayes_pkg",
"//snowflake/ml/sklearn/neighbors:sklearn_neighbors_pkg",
"//snowflake/ml/sklearn/semi_supervised:sklearn_semi_supervised_pkg",
"//snowflake/ml/xgboost:xgboost_pkg",
"//snowflake/ml/lightgbm:lightgbm_pkg",
],
)
1 change: 0 additions & 1 deletion snowflake/ml/_internal/BUILD.bazel
@@ -27,7 +27,6 @@ py_library(
py_test(
name = "file_utils_test",
srcs = ["file_utils_test.py"],
timeout = "short",
deps = [
":file_utils",
],
21 changes: 19 additions & 2 deletions snowflake/ml/_internal/file_utils.py
@@ -2,8 +2,9 @@
import io
import os
import shutil
import tempfile
import zipfile
from typing import Generator, Optional
from typing import IO, Generator, Optional

GENERATED_PY_FILE_EXT = (".pyc", ".pyo", ".pyd", ".pyi")

@@ -69,7 +70,6 @@ def zip_file_or_directory_to_stream(

with io.BytesIO() as input_stream:
with zipfile.ZipFile(input_stream, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:

if os.path.realpath(path) != os.path.realpath(start_path):
cur_path = os.path.dirname(path)
while os.path.realpath(cur_path) != os.path.realpath(start_path):
@@ -92,3 +92,20 @@
zf.write(path, os.path.relpath(path, start_path))

yield input_stream


@contextlib.contextmanager
def unzip_stream_in_temp_dir(stream: IO[bytes], temp_root: Optional[str] = None) -> Generator[str, None, None]:
"""Unzip an IO stream into a temporary directory.

Args:
stream: The input stream.
temp_root: The root directory in which the temporary directory should be created. Defaults to None.

Yields:
The path to the created temporary directory.
"""
with tempfile.TemporaryDirectory(dir=temp_root) as tempdir:
with zipfile.ZipFile(stream, mode="r", compression=zipfile.ZIP_DEFLATED) as zf:
zf.extractall(path=tempdir)
yield tempdir
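A round-trip usage sketch pairing the new context manager with zip_file_or_directory_to_stream from earlier in this file (the directory path is an assumption; both helpers are context managers, so with blocks are required):

```python
import os
from snowflake.ml._internal import file_utils

src = "/tmp/my_pkg"  # hypothetical directory to package
with file_utils.zip_file_or_directory_to_stream(src, start_path="/tmp") as stream:
    stream.seek(0)  # rewind the BytesIO before reading it back
    with file_utils.unzip_stream_in_temp_dir(stream) as tmpdir:
        print(os.listdir(tmpdir))  # extracted contents; removed when the block exits
```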
(Diffs for the remaining changed files are not shown.)
