Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add catboost to the third-party integration tests #17267

Draft
wants to merge 4 commits into
base: branch-24.12
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .github/workflows/pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ jobs:
- unit-tests-cudf-pandas
- pandas-tests
- pandas-tests-diff
- third-party-integration-tests-cudf-pandas
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12
if: always()
Expand Down Expand Up @@ -302,3 +303,14 @@ jobs:
node_type: cpu4
build_type: pull-request
run_script: "ci/cudf_pandas_scripts/pandas-tests/diff.sh"
third-party-integration-tests-cudf-pandas:
needs: wheel-build-cudf
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
with:
build_type: pull-request
node_type: "gpu-v100-latest-1"
arch: "amd64"
container_image: "rapidsai/ci-conda:latest"
run_script: |
ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,13 @@ files:
- py_version
- test_base
- test_xgboost
test_catboost:
output: none
includes:
- cuda_version
- py_version
- test_base
- test_catboost
test_cuml:
output: none
includes:
Expand Down Expand Up @@ -244,6 +251,16 @@ dependencies:
- pip
- pip:
- xgboost>=2.0.1
test_catboost:
common:
- output_types: conda
packages:
- numpy
- scipy
- scikit-learn
- pip
- pip:
- catboost
test_cuml:
common:
- output_types: conda
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import numpy as np
import pandas as pd
import pytest
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.datasets import make_classification, make_regression

# Module-level generator with a fixed seed. It is shared by
# test_catboost_with_categorical_features and by the parametrize data of
# test_catboost_train_test_split, so the values each of them sees depend on
# the order in which draws are made from this single stream.
rng = np.random.default_rng(seed=42)


def assert_catboost_equal(expect, got, rtol=1e-7, atol=0.0):
    """Assert that ``got`` matches ``expect``, dispatching on ``expect``'s type.

    Parameters
    ----------
    expect
        Reference value. Tuples and lists are compared element by element
        (recursively); NumPy arrays with ``np.testing.assert_allclose``;
        pandas DataFrames and Series with the matching ``pd.testing``
        helper; anything else with ``==``.
    got
        Value under test.
    rtol, atol
        Tolerances forwarded to ``np.testing.assert_allclose``. They are
        only used for NumPy arrays; the pandas comparisons run with the
        pandas defaults.

    Raises
    ------
    AssertionError
        If the two values differ.
    """
    if isinstance(expect, (tuple, list)):
        assert len(expect) == len(got)
        for expected_item, actual_item in zip(expect, got):
            assert_catboost_equal(expected_item, actual_item, rtol, atol)
        return

    if isinstance(expect, np.ndarray):
        np.testing.assert_allclose(expect, got, rtol=rtol, atol=atol)
        return

    if isinstance(expect, pd.DataFrame):
        pd.testing.assert_frame_equal(expect, got)
        return

    if isinstance(expect, pd.Series):
        pd.testing.assert_series_equal(expect, got)
        return

    assert expect == got


# Module-wide pytest mark: attaches ``assert_eq`` with
# ``fn=assert_catboost_equal`` to every test in this file.
# NOTE(review): ``assert_eq`` is a project-defined marker; presumably the test
# harness uses ``fn`` to compare the values the tests return - confirm.
pytestmark = pytest.mark.assert_eq(fn=assert_catboost_equal)


@pytest.fixture
def regression_data():
    """Provide a synthetic regression problem as ``(DataFrame, Series)``.

    The data comes from ``make_regression`` with 100 samples, 10 features
    and ``random_state=42``.
    """
    features, target = make_regression(
        n_samples=100,
        n_features=10,
        random_state=42,
    )
    return pd.DataFrame(features), pd.Series(target)


@pytest.fixture
def classification_data():
    """Provide a synthetic two-class problem as ``(DataFrame, Series)``.

    The data comes from ``make_classification`` with 100 samples, 10
    features, 2 classes and ``random_state=42``.
    """
    features, labels = make_classification(
        n_samples=100,
        n_features=10,
        n_classes=2,
        random_state=42,
    )
    return pd.DataFrame(features), pd.Series(labels)


def test_catboost_regressor_with_dataframe(regression_data):
    """Fit a 10-iteration CatBoostRegressor on pandas inputs.

    Returns the predictions on the training features.
    """
    features, target = regression_data
    regressor = CatBoostRegressor(iterations=10, verbose=0)
    regressor.fit(features, target)
    return regressor.predict(features)


def test_catboost_regressor_with_numpy(regression_data):
    """Fit a 10-iteration CatBoostRegressor on the ``.values`` of the fixture.

    Returns the predictions on the same ``X.values`` used for training.
    """
    features, target = regression_data
    regressor = CatBoostRegressor(iterations=10, verbose=0)
    regressor.fit(features.values, target.values)
    return regressor.predict(features.values)


def test_catboost_classifier_with_dataframe(classification_data):
    """Fit a 10-iteration CatBoostClassifier on pandas inputs.

    Returns the predicted labels for the training features.
    """
    features, labels = classification_data
    classifier = CatBoostClassifier(iterations=10, verbose=0)
    classifier.fit(features, labels)
    return classifier.predict(features)


def test_catboost_classifier_with_numpy(classification_data):
    """Fit a 10-iteration CatBoostClassifier on the ``.values`` of the fixture.

    Returns the predicted labels for the same ``X.values`` used for training.
    """
    features, labels = classification_data
    classifier = CatBoostClassifier(iterations=10, verbose=0)
    classifier.fit(features.values, labels.values)
    return classifier.predict(features.values)


def test_catboost_with_pool_and_dataframe(regression_data):
    """Train a CatBoostRegressor from a ``Pool`` built on pandas inputs.

    Returns the predictions on the original feature DataFrame.
    """
    features, target = regression_data
    training_pool = Pool(features, target)
    regressor = CatBoostRegressor(iterations=10, verbose=0)
    regressor.fit(training_pool)
    return regressor.predict(features)


def test_catboost_with_pool_and_numpy(regression_data):
    """Train a CatBoostRegressor from a ``Pool`` built on ``.values`` arrays.

    Returns the predictions on ``X.values``.
    """
    features, target = regression_data
    training_pool = Pool(features.values, target.values)
    regressor = CatBoostRegressor(iterations=10, verbose=0)
    regressor.fit(training_pool)
    return regressor.predict(features.values)


def test_catboost_with_categorical_features():
    """Fit a CatBoostClassifier on one numeric and one categorical column.

    The 100-row frame is drawn from the module-level ``rng``; the
    categorical column holds the strings "A", "B" and "C" and is declared
    to CatBoost through ``cat_features``. Returns the predicted labels for
    the training features.
    """
    # The draws stay in the order normal -> choice -> integers so the shared
    # ``rng`` stream is consumed exactly as before.
    frame = pd.DataFrame(
        {
            "numerical_feature": rng.standard_normal(100),
            "categorical_feature": rng.choice(["A", "B", "C"], size=100),
            "target": rng.integers(0, 2, size=100),
        }
    )
    features = frame[["numerical_feature", "categorical_feature"]]
    target = frame["target"]
    classifier = CatBoostClassifier(
        iterations=10,
        verbose=0,
        cat_features=["categorical_feature"],
    )
    classifier.fit(features, target)
    return classifier.predict(features)


@pytest.mark.parametrize(
    "X, y",
    [
        (
            pd.DataFrame(rng.standard_normal((100, 5))),
            pd.Series(rng.standard_normal(100)),
        ),
        # The shape has to be a single tuple: the second positional
        # parameter of ``Generator.standard_normal`` is ``dtype``, so
        # ``standard_normal(100, 5)`` raises while this list is being built
        # and the whole module fails to collect.
        (rng.standard_normal((100, 5)), rng.standard_normal(100)),
    ],
)
def test_catboost_train_test_split(X, y):
    """Fit a CatBoostRegressor on a scikit-learn train/test split.

    Parametrized over a pandas ``(DataFrame, Series)`` pair and a NumPy
    ``(ndarray, ndarray)`` pair, each with 100 rows and 5 feature columns
    drawn from the module-level ``rng``.

    Returns
    -------
    tuple
        ``(len(X_train), len(X_test), len(y_train), len(y_test),
        predictions)`` where ``predictions`` are the model outputs for
        ``X_test``.
    """
    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    model = CatBoostRegressor(iterations=10, verbose=0)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return len(X_train), len(X_test), len(y_train), len(y_test), predictions
Loading