diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 78808d17f05..9be5ee35486 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -188,7 +188,10 @@ def tune( retain_trials: bool = False, packages_to_install: List[str] = None, pip_index_url: str = "https://pypi.org/simple", - metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"}, + metrics_collector_config: Dict[str, Any] = { + "kind": "StdOut", + "custom_collector": None, + }, ): """Create HyperParameter Tuning Katib Experiment from the objective function. @@ -253,9 +256,22 @@ def tune( to the base image packages. These packages are installed before executing the objective function. pip_index_url: The PyPI url from which to install Python packages. + + `metrics_collector_config`: Specify the configuration + for the metrics collector with the following keys: + - **kind**: Specify the kind of Metrics Collector. Currently supported values are: + - `StdOut`: Collects metrics from standard output. + - `None`: No metrics collection. + - `File`: Writes metrics to a file. + - `TensorFlowEvent`: Collects metrics in TensorFlow Event format. + - `PrometheusMetric`: Exposes metrics in a Prometheus-compatible format. + - `Custom`: For custom metrics collection. Use the "custom_collector" + key to specify the collector instance. + + - **custom_collector**: If the `kind` is set to `Custom`, you must provide an + instance of a custom `V1Container` as the value. + For example: `metrics_collector_config = + {"kind": "Custom", "custom_collector": custom_collector_instance}`. Raises: ValueError: Function arguments have incorrect type or value. 
@@ -396,7 +412,10 @@ def tune( # Up to now, we only support parameter `kind`, of which default value # is `StdOut`, to specify the kind of metrics collector. experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( - collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]) + collector=models.V1beta1CollectorSpec( + kind=metrics_collector_config["kind"], + custom_collector=metrics_collector_config["custom_collector"], + ) ) # Create Trial specification. diff --git a/test/e2e/v1beta1/scripts/gh-actions/Dockerfile.dummy-collector b/test/e2e/v1beta1/scripts/gh-actions/Dockerfile.dummy-collector new file mode 100755 index 00000000000..d592af3323e --- /dev/null +++ b/test/e2e/v1beta1/scripts/gh-actions/Dockerfile.dummy-collector @@ -0,0 +1,9 @@ +FROM python:3.8-slim + +WORKDIR /app + +COPY test/e2e/v1beta1/scripts/gh-actions/dummy-collector.py . + +RUN pip install kubernetes + +CMD ["python", "dummy-collector.py"] \ No newline at end of file diff --git a/test/e2e/v1beta1/scripts/gh-actions/build-load.sh b/test/e2e/v1beta1/scripts/gh-actions/build-load.sh index cb0ea03cd5a..862c5d1ac3b 100755 --- a/test/e2e/v1beta1/scripts/gh-actions/build-load.sh +++ b/test/e2e/v1beta1/scripts/gh-actions/build-load.sh @@ -167,6 +167,9 @@ done if "$TUNE_API"; then echo -e "\nPulling and building testing image for tune function..." _build_containers "suggestion-hyperopt" "$CMD_PREFIX/suggestion/hyperopt/$VERSION/Dockerfile" + + echo -e "\nBuilding dummy collector image..." 
+ _build_containers "dummy-collector" "test/e2e/v1beta1/scripts/gh-actions/Dockerfile.dummy-collector" fi echo -e "\nCleanup Build Cache...\n" diff --git a/test/e2e/v1beta1/scripts/gh-actions/dummy-collector.py b/test/e2e/v1beta1/scripts/gh-actions/dummy-collector.py new file mode 100755 index 00000000000..5fa2497a828 --- /dev/null +++ b/test/e2e/v1beta1/scripts/gh-actions/dummy-collector.py @@ -0,0 +1,27 @@ +import argparse +import logging +import time + +from kubernetes import client, config + +# The default logging config. +logging.basicConfig(level=logging.INFO) + +def collect_metrics(metric_name : str): + config.load_incluster_config() + v1 = client.CoreV1Api() + + while True: + dummy_metric_value = 42 + logging.info(f"Collected dummy metric: {metric_name}={dummy_metric_value}") + + time.sleep(10) # Collect metrics every 10 seconds + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--metric-name", type=str, required=True, help="Name of the metric to collect") + args = parser.parse_args() + + collect_metrics(args.metric_name) + + \ No newline at end of file diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.py old mode 100644 new mode 100755 diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py old mode 100644 new mode 100755 index c9d1cb2ee43..745787f0077 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py @@ -3,6 +3,7 @@ from kubeflow.katib import KatibClient, search from kubernetes import client +from kubernetes.client import V1Container from verify import verify_experiment_results # Experiment timeout is 40 min. @@ -11,8 +12,68 @@ # The default logging config. 
logging.basicConfig(level=logging.INFO) +def run_e2e_experiment_create_by_tune_custom_metrics_collector( + katib_client: KatibClient, + exp_name: str, + exp_namespace: str, +): + # Create Katib Experiment and wait until it is finished. + logging.debug("Creating Experiment: {}/{}".format(exp_namespace, exp_name)) + + # Use the test case from get-started tutorial. + # https://www.kubeflow.org/docs/components/katib/getting-started/#getting-started-with-katib-python-sdk + # [1] Create an objective function. + def objective(parameters): + import time + time.sleep(5) + result = 4 * int(parameters["a"]) - float(parameters["b"]) ** 2 + print(f"result={result}") + + # [2] Create hyperparameter search space. + parameters = { + "a": search.int(min=10, max=20), + "b": search.double(min=0.1, max=0.2) + } + + # [3] Create a dummy metric collector (DOES NOT HAVE AN IMAGE) + metric_collector = V1Container( + name="dummy-collector", + image="dummy-collector:latest", + command=["python", "/app/dummy-collector.py"], + args=["--metric-name=result"], + env=[ + client.V1EnvVar(name="EXPERIMENT_NAME", value=exp_name), + client.V1EnvVar(name="EXPERIMENT_NAMESPACE", value=exp_namespace) + ] + ) + + # [4] Create Katib Experiment with 4 Trials and 2 CPUs per Trial. + # And Wait until Experiment reaches Succeeded condition. + katib_client.tune( + name=exp_name, + namespace=exp_namespace, + objective=objective, + parameters=parameters, + objective_metric_name="result", + max_trial_count=4, + resources_per_trial={"cpu": "2"}, + metrics_collector_config={ + "kind": "Custom", + "custom_collector": metric_collector, + }, + ) + experiment = katib_client.wait_for_experiment_condition( + exp_name, exp_namespace, timeout=EXPERIMENT_TIMEOUT + ) + + # Verify the Experiment results. + verify_experiment_results(katib_client, experiment, exp_name, exp_namespace) + + # Print the Experiment and Suggestion. 
+ logging.debug(katib_client.get_experiment(exp_name, exp_namespace)) + logging.debug(katib_client.get_suggestion(exp_name, exp_namespace)) -def run_e2e_experiment_create_by_tune( +def run_e2e_experiment_create_by_tune_default_metrics_collector( katib_client: KatibClient, exp_name: str, exp_namespace: str, @@ -57,7 +118,6 @@ def objective(parameters): logging.debug(katib_client.get_experiment(exp_name, exp_namespace)) logging.debug(katib_client.get_suggestion(exp_name, exp_namespace)) - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -78,11 +138,12 @@ def objective(parameters): namespace_labels['katib.kubeflow.org/metrics-collector-injection'] = 'enabled' client.CoreV1Api().patch_namespace(args.namespace, {'metadata': {'labels': namespace_labels}}) - # Test with run_e2e_experiment_create_by_tune - exp_name = "tune-example" + # Test with run_e2e_experiment_create_by_tune_default_metrics_collector exp_namespace = args.namespace try: - run_e2e_experiment_create_by_tune(katib_client, exp_name, exp_namespace) + exp_name = "tune-example-default-metrics-collector" + logging.info(f"Running E2E for Experiment created by tune: {exp_namespace}/{exp_name}") + run_e2e_experiment_create_by_tune_default_metrics_collector(katib_client, exp_name, exp_namespace) logging.info("---------------------------------------------------------------") logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}") except Exception as e: @@ -94,3 +155,21 @@ def objective(parameters): logging.info("---------------------------------------------------------------") logging.info("---------------------------------------------------------------") katib_client.delete_experiment(exp_name, exp_namespace) + + + # Test with run_e2e_experiment_create_by_tune_custom_metrics_collector + try: + exp_name = "tune-example-custom-metrics-collector" + logging.info(f"Running E2E for Experiment created by tune: {exp_namespace}/{exp_name}") + 
run_e2e_experiment_create_by_tune_custom_metrics_collector(katib_client, exp_name, exp_namespace) + logging.info("---------------------------------------------------------------") + logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}") + except Exception as e: + logging.info("---------------------------------------------------------------") + logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}") + raise e + finally: + # Delete the Experiment. + logging.info("---------------------------------------------------------------") + logging.info("---------------------------------------------------------------") + katib_client.delete_experiment(exp_name, exp_namespace) \ No newline at end of file diff --git a/test/e2e/v1beta1/scripts/gh-actions/verify.py b/test/e2e/v1beta1/scripts/gh-actions/verify.py old mode 100644 new mode 100755