Project import generated by Copybara. (#80)
GitOrigin-RevId: 2059912491dc8ea84f5dd0196f371206ee57919c

Co-authored-by: Snowflake Authors <[email protected]>
sfc-gh-anavalos and Snowflake Authors authored Dec 18, 2023
1 parent 7dd0738 commit 35d2b4f
Showing 163 changed files with 11,022 additions and 4,426 deletions.
20 changes: 18 additions & 2 deletions CHANGELOG.md
@@ -1,14 +1,30 @@
# Release History

## 1.1.2

### Bug Fixes

- Generic: Fix an issue where the stack trace was unexpectedly hidden by telemetry.
- Model Development: Execute model signature inference without materializing the full DataFrame in memory.
- Model Registry: Fix an occasional 'snowflake-ml-python library does not exist' error when deploying to SPCS.

### Behavior Changes

- Model Registry: When calling `predict` with a Snowpark DataFrame, either inferred or normalized column names are accepted (see the sketch below).
- Model Registry: When logging a Snowpark ML Modeling Model, any sample input data or manually provided signature is
  ignored, since it is not needed.
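
  For illustration, a minimal sketch of the column-name behavior change, assuming `model` is a reference to a
  registered model exposing a `predict` method and `session` is a Snowpark session (all names here are hypothetical,
  not taken from this commit):

      from snowflake.snowpark.functions import col

      # Normalized (upper-cased, unquoted) column names.
      df_normalized = session.table("FEATURES")  # columns: FEATURE_1, FEATURE_2
      # Inferred (case-preserved, quoted) column names.
      df_inferred = df_normalized.select(
          col("FEATURE_1").alias('"feature_1"'),
          col("FEATURE_2").alias('"feature_2"'),
      )

      # As of 1.1.2, predict accepts either naming style.
      model.predict(df_normalized)
      model.predict(df_inferred)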

### New Features

- Model Development: Add a SQL implementation of the binary `precision_score` metric.
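
  A hedged usage sketch: `precision_score` mirrors the scikit-learn metric but is computed in SQL over a Snowpark
  DataFrame. This assumes the metric follows the library's usual `df`/`y_true_col_names`/`y_pred_col_names` keyword
  pattern; the DataFrame and column names below are hypothetical:

      from snowflake.ml.modeling.metrics import precision_score

      # df: a Snowpark DataFrame holding ground-truth and predicted labels.
      score = precision_score(
          df=df,
          y_true_col_names="LABEL",
          y_pred_col_names="PREDICTED_LABEL",
          average="binary",
      )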

## 1.1.1

### Bug Fixes

- Model Registry: The `predict` target method on registered models is now compatible with unsupervised estimators.
- Model Development: Fix incorrect `confusion_matrix` results when the number of rows is not divisible by the batch size.

### Behavior Changes

### New Features

- Introduced `passthrough_cols` param in Modeling API. This new param is helpful in scenarios
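
  A hedged sketch of how `passthrough_cols` might be used with a Modeling API estimator (the estimator choice and
  column names are illustrative, not from this commit):

      from snowflake.ml.modeling.xgboost import XGBClassifier

      clf = XGBClassifier(
          label_cols=["LABEL"],
          # ROW_ID is excluded from training and inference but carried through.
          passthrough_cols=["ROW_ID"],
      )
      clf.fit(train_df)  # input_cols inferred: all columns except LABEL and ROW_ID
      predictions = clf.predict(test_df)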
2 changes: 2 additions & 0 deletions bazel/environments/conda-env-snowflake.yml
@@ -31,11 +31,13 @@ dependencies:
- packaging==23.0
- pandas==1.5.3
- protobuf==3.20.3
- pyarrow==10.0.1
- pytest==7.4.0
- pytimeparse==1.1.8
- pytorch==2.0.1
- pyyaml==6.0
- requests==2.29.0
- retrying==1.3.3
- ruamel.yaml==0.17.21
- s3fs==2023.3.0
- scikit-learn==1.3.0
2 changes: 2 additions & 0 deletions bazel/environments/conda-env.yml
@@ -36,11 +36,13 @@ dependencies:
- packaging==23.0
- pandas==1.5.3
- protobuf==3.20.3
- pyarrow==10.0.1
- pytest==7.4.0
- pytimeparse==1.1.8
- pytorch==2.0.1
- pyyaml==6.0
- requests==2.29.0
- retrying==1.3.3
- ruamel.yaml==0.17.21
- s3fs==2023.3.0
- scikit-learn==1.3.0
2 changes: 2 additions & 0 deletions bazel/environments/conda-gpu-env.yml
@@ -37,12 +37,14 @@ dependencies:
- packaging==23.0
- pandas==1.5.3
- protobuf==3.20.3
- pyarrow==10.0.1
- pytest==7.4.0
- pytimeparse==1.1.8
- pytorch::pytorch-cuda==11.7.*
- pytorch::pytorch==2.0.1
- pyyaml==6.0
- requests==2.29.0
- retrying==1.3.3
- ruamel.yaml==0.17.21
- s3fs==2023.3.0
- scikit-learn==1.3.0
4 changes: 3 additions & 1 deletion ci/conda_recipe/meta.yaml
@@ -17,7 +17,7 @@ build:
noarch: python
package:
name: snowflake-ml-python
version: 1.1.1
version: 1.1.2
requirements:
build:
- python
@@ -33,9 +33,11 @@ requirements:
- numpy>=1.23,<2
- packaging>=20.9,<24
- pandas>=1.0.0,<2
- pyarrow
- pytimeparse>=1.1.8,<2
- pyyaml>=6.0,<7
- requests
- retrying>=1.3.3,<2
- s3fs>=2022.11,<2024
- scikit-learn>=1.2.1,<1.4
- scipy>=1.9,<2
2 changes: 2 additions & 0 deletions codegen/codegen_rules.bzl
@@ -92,6 +92,8 @@ def autogen_estimators(module, estimator_info_list):
"//snowflake/ml/model:model_signature",
"//snowflake/ml/model/_signatures:utils",
"//snowflake/ml/modeling/_internal:estimator_utils",
"//snowflake/ml/modeling/_internal:model_trainer",
"//snowflake/ml/modeling/_internal:model_trainer_builder",
],
)

94 changes: 59 additions & 35 deletions codegen/sklearn_wrapper_generator.py
@@ -16,44 +16,58 @@
LOAD_DIABETES = "load_diabetes"


ADDITIONAL_PARAM_DESCRIPTIONS = """
ADDITIONAL_PARAM_DESCRIPTIONS = {
"input_cols": """
input_cols: Optional[Union[str, List[str]]]
A string or list of strings representing column names that contain features.
If this parameter is not specified, all columns in the input DataFrame except
the columns specified by label_cols, sample_weight_col, and passthrough_cols
parameters are considered input columns.
parameters are considered input columns. Input columns can also be set after
initialization with the `set_input_cols` method.
""",
"label_cols": """
label_cols: Optional[Union[str, List[str]]]
A string or list of strings representing column names that contain labels.
This is a required param for estimators, as there is no way to infer these
columns. If this parameter is not specified, then object is fitted without
labels (like a transformer).
Label columns must be specified with this parameter during initialization
or with the `set_label_cols` method before fitting.
""",
"output_cols": """
output_cols: Optional[Union[str, List[str]]]
A string or list of strings representing column names that will store the
output of predict and transform operations. The length of output_cols must
match the expected number of output columns from the specific estimator or
match the expected number of output columns from the specific predictor or
transformer class used.
If this parameter is not specified, output column names are derived by
adding an OUTPUT_ prefix to the label column names. These inferred output
column names work for estimator's predict() method, but output_cols must
be set explicitly for transformers.
If you omit this parameter, output column names are derived by adding an
OUTPUT_ prefix to the label column names for supervised estimators, or
OUTPUT_<IDX> for unsupervised estimators. These inferred output column names
work for predictors, but output_cols must be set explicitly for transformers.
In general, explicitly specifying output column names is clearer, especially
if you don’t specify the input column names.
To transform in place, pass the same names for input_cols and output_cols.
be set explicitly for transformers. Output columns can also be set after
initialization with the `set_output_cols` method.
""",
"sample_weight_col": """
sample_weight_col: Optional[str]
A string representing the column name containing the sample weights.
This argument is only required when working with weighted datasets.
This argument is only required when working with weighted datasets. Sample
weight column can also be set after initialization with the
`set_sample_weight_col` method.
""",
"passthrough_cols": """
passthrough_cols: Optional[Union[str, List[str]]]
A string or a list of strings indicating column names to be excluded from any
operations (such as train, transform, or inference). These specified column(s)
will remain untouched throughout the process. This option is helpful in scenarios
requiring automatic input_cols inference, but need to avoid using specific
columns, like index columns, during training or inference.
columns, like index columns, during training or inference. Passthrough columns
can also be set after initialization with the `set_passthrough_cols` method.
""",
"drop_input_cols": """
drop_input_cols: Optional[bool], default=False
If set, the response of predict(), transform() methods will not contain input columns.
"""
""",
}
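
The descriptions above all point to post-initialization setters; a minimal sketch of that pattern, assuming a
generated estimator such as `LinearRegression` (the class choice and column names are illustrative, not part of
this file):

    from snowflake.ml.modeling.linear_model import LinearRegression

    lr = LinearRegression()
    lr.set_input_cols(["FEATURE_1", "FEATURE_2"])
    lr.set_label_cols(["TARGET"])
    lr.set_output_cols(["PREDICTED_TARGET"])
    lr.fit(train_df)  # train_df: a Snowpark or pandas DataFrame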

ADDITIONAL_METHOD_DESCRIPTION = """
Raises:
@@ -448,7 +462,6 @@ class WrapperGeneratorBase:
is contained in.
estimator_imports GENERATED Imports needed for the estimator / fit()
call.
wrapper_provider_class GENERATED Class name of wrapper provider.
------------------------------------------------------------------------------------
SIGNATURES AND ARGUMENTS
------------------------------------------------------------------------------------
@@ -545,7 +558,6 @@ def __init__(self, module_name: str, class_object: Tuple[str, type]) -> None:
self.estimator_imports = ""
self.estimator_imports_list: List[str] = []
self.score_sproc_imports: List[str] = []
self.wrapper_provider_class = ""
self.additional_import_statements = ""

# Test strings
@@ -630,10 +642,11 @@ def _populate_class_doc_fields(self) -> None:
class_docstring = inspect.getdoc(self.class_object[1]) or ""
class_docstring = class_docstring.rsplit("Attributes\n", 1)[0]

parameters_heading = "Parameters\n----------\n"
class_description, param_description = (
class_docstring.rsplit("Parameters\n", 1)
if len(class_docstring.rsplit("Parameters\n", 1)) == 2
else (class_docstring, "----------\n")
class_docstring.rsplit(parameters_heading, 1)
if len(class_docstring.rsplit(parameters_heading, 1)) == 2
else (class_docstring, "")
)

# Extract the first sentence of the class description
@@ -645,9 +658,11 @@ def _populate_class_doc_fields(self) -> None:
f"]\n({self.get_doc_link()})"
)

# Add SnowML specific param descriptions.
param_description = "Parameters\n" + param_description.strip()
param_description += ADDITIONAL_PARAM_DESCRIPTIONS
# Add SnowML specific param descriptions before third party parameters.
snowml_parameters = ""
for d in ADDITIONAL_PARAM_DESCRIPTIONS.values():
snowml_parameters += d
param_description = f"{parameters_heading}{snowml_parameters}\n{param_description.strip()}"

class_docstring = f"{class_description}\n\n{param_description}"
class_docstring = textwrap.indent(class_docstring, " ").strip()
@@ -718,12 +733,23 @@ def _populate_function_names_and_signatures(self) -> None:
for member in inspect.getmembers(self.class_object[1]):
if member[0] == "__init__":
self.original_init_signature = inspect.signature(member[1])
elif member[0] == "fit":
original_fit_signature = inspect.signature(member[1])
if original_fit_signature.parameters["y"].default is None:
# The fit does not require labels, so our label_cols argument is optional.
ADDITIONAL_PARAM_DESCRIPTIONS[
"label_cols"
] = """
label_cols: Optional[Union[str, List[str]]]
This parameter is optional and will be ignored during fit. It is present here for API consistency by convention.
"""

signature_lines = []
sklearn_init_lines = []
init_member_args = []
has_kwargs = False
sklearn_init_args_dict_list = []

for k, v in self.original_init_signature.parameters.items():
if k == "self":
signature_lines.append("self")
@@ -855,9 +881,9 @@ def generate(self) -> "WrapperGeneratorBase":
self._populate_flags()
self._populate_class_names()
self._populate_import_statements()
self._populate_class_doc_fields()
self._populate_function_doc_fields()
self._populate_function_names_and_signatures()
self._populate_class_doc_fields()
self._populate_file_paths()
self._populate_integ_test_fields()
return self
@@ -876,13 +902,8 @@ def generate(self) -> "SklearnWrapperGenerator":
# Populate all the common values
super().generate()

is_model_selector = WrapperGeneratorFactory._is_class_of_type(self.class_object[1], "BaseSearchCV")

# Populate SKLearn specific values
self.estimator_imports_list.extend(["import sklearn", f"import {self.root_module_name}"])
self.wrapper_provider_class = (
"SklearnModelSelectionWrapperProvider" if is_model_selector else "SklearnWrapperProvider"
)
self.score_sproc_imports = ["sklearn"]

if "random_state" in self.original_init_signature.parameters.keys():
@@ -982,6 +1003,9 @@ def generate(self) -> "SklearnWrapperGenerator":
if self._is_hist_gradient_boosting_regressor:
self.test_estimator_input_args_list.extend(["min_samples_leaf=1", "max_leaf_nodes=100"])

self.deps = (
"f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'"
)
self.supported_export_method = "to_sklearn"
self.unsupported_export_methods = ["to_xgboost", "to_lightgbm"]
self._construct_string_from_lists()
@@ -1010,10 +1034,10 @@ def generate(self) -> "XGBoostWrapperGenerator":
["random_state=0", "subsample=1.0", "colsample_bynode=1.0", "n_jobs=1"]
)
self.score_sproc_imports = ["xgboost"]
self.wrapper_provider_class = "XGBoostWrapperProvider"
# TODO(snandamuri): Replace cloudpickle with joblib after latest version of joblib is added to snowflake conda.
self.supported_export_method = "to_xgboost"
self.unsupported_export_methods = ["to_sklearn", "to_lightgbm"]
self.deps = "f'numpy=={np.__version__}', f'xgboost=={xgboost.__version__}', f'cloudpickle=={cp.__version__}'"
self._construct_string_from_lists()
return self

@@ -1039,8 +1063,8 @@ def generate(self) -> "LightGBMWrapperGenerator":
self.estimator_imports_list.append("import lightgbm")
self.test_estimator_input_args_list.extend(["random_state=0", "n_jobs=1"])
self.score_sproc_imports = ["lightgbm"]
self.wrapper_provider_class = "LightGBMWrapperProvider"

self.deps = "f'numpy=={np.__version__}', f'lightgbm=={lightgbm.__version__}', f'cloudpickle=={cp.__version__}'"
self.supported_export_method = "to_lightgbm"
self.unsupported_export_methods = ["to_sklearn", "to_xgboost"]
self._construct_string_from_lists()