refactor(sdk): added option for custom metric collector for tune in… #2406

Open · wants to merge 13 commits into base: master

Changes from 5 commits
14 changes: 10 additions & 4 deletions sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
@@ -186,7 +186,10 @@ def tune(
retain_trials: bool = False,
packages_to_install: List[str] = None,
pip_index_url: str = "https://pypi.org/simple",
metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"},
metrics_collector_config: Dict[str, Any] = {
"kind": "StdOut",
"custom_collector": None
},
):
"""Create HyperParameter Tuning Katib Experiment from the objective function.

@@ -251,8 +254,8 @@ def tune(
pip_index_url: The PyPI url from which to install Python packages.
metrics_collector_config: Specify the config of metrics collector,
for example, `metrics_collector_config = {"kind": "Push"}`.
Currently, we only support `StdOut` and `Push` metrics collector.
Member:
I think we may need to tell users about the supported types of MC. So can you re-add this line?


To use a custom metrics collector, set the "custom_collector" key to an instance of a custom V1Container,
for example, `metrics_collector_config = {"kind": "Custom", "custom_collector": <instance of V1Container>}`.
Member:
Maybe we can reorganize these comments and explain each field in metrics_collector_config? Like

kind: specify the kind of Metrics Collector (currently we support...)
custom_collector: ...
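
A rough sketch of how the reorganized docstring entry could read, following this suggestion; the exact wording and field descriptions below are placeholders to be confirmed, not final text:

    metrics_collector_config: Config of the metrics collector.
        kind: Kind of the metrics collector; `StdOut`, `Push`, and (with this change)
            `Custom` are supported. Defaults to `StdOut`.
        custom_collector: Instance of `V1Container` to use as the metrics collector
            when `kind` is `Custom`. Defaults to `None`.
        For example, `metrics_collector_config = {"kind": "Push"}` or
        `metrics_collector_config = {"kind": "Custom", "custom_collector": <instance of V1Container>}`.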

Raises:
ValueError: Function arguments have incorrect type or value.
TimeoutError: Timeout to create Katib Experiment.
@@ -387,7 +390,10 @@ def tune(
# Add metrics collector to the Katib Experiment.
# Up to now, We only support parameter `kind`, of which default value is `StdOut`, to specify the kind of metrics collector.
experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec(
collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"])
collector=models.V1beta1CollectorSpec(
kind=metrics_collector_config["kind"],
custom_collector=metrics_collector_config["custom_collector"],
)
)

# Create Trial specification.
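For context, a minimal sketch of how a caller might use this option once the change is merged; the objective function, search space, and container image below are illustrative placeholders, not part of this PR:

from kubeflow.katib import KatibClient, search
from kubernetes.client import V1Container

def objective(parameters):
    # Placeholder objective; a real one would train a model and report the metric.
    print(f"result={parameters['lr']}")

collector = V1Container(
    name="custom-collector",
    image="example.com/my-metrics-collector:latest",  # placeholder image
)

KatibClient().tune(
    name="tune-with-custom-collector",
    objective=objective,
    parameters={"lr": search.double(min=0.01, max=0.1)},
    objective_metric_name="result",
    metrics_collector_config={"kind": "Custom", "custom_collector": collector},
)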
9 changes: 9 additions & 0 deletions test/e2e/v1beta1/scripts/gh-actions/Dockerfile.dummy-collector
@@ -0,0 +1,9 @@
FROM python:3.8-slim

WORKDIR /app

COPY dummy-collector.py .

RUN pip install kubernetes

CMD ["python", "dummy-collector.py"]
3 changes: 3 additions & 0 deletions test/e2e/v1beta1/scripts/gh-actions/build-load.sh
@@ -167,6 +167,9 @@ done
if "$TUNE_API"; then
echo -e "\nPulling and building testing image for tune function..."
_build_containers "suggestion-hyperopt" "$CMD_PREFIX/suggestion/hyperopt/$VERSION/Dockerfile"

echo -e "\nBuilding dummy collector image..."
_build_containers "dummy-collector" "test/e2e/v1beta1/scripts/gh-actions/Dockerfile.dummy-collector"
fi

echo -e "\nCleanup Build Cache...\n"
28 changes: 28 additions & 0 deletions test/e2e/v1beta1/scripts/gh-actions/dummy-collector.py
@@ -0,0 +1,28 @@
import argparse
import logging
import time

from kubernetes import client
from kubernetes import config

# The default logging config.
logging.basicConfig(level=logging.INFO)

def collect_metrics(metric_name: str):
    # Load the in-cluster config; the API client is created but not used by this dummy collector.
    config.load_incluster_config()
    v1 = client.CoreV1Api()

    while True:
        # Emit a constant dummy value for the requested metric.
        dummy_metric_value = 42
        logging.info(f"Collected dummy metric: {metric_name}={dummy_metric_value}")

        time.sleep(10)  # Collect metrics every 10 seconds


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--metric-name", type=str, required=True, help="Name of the metric to collect")
    args = parser.parse_args()

    collect_metrics(args.metric_name)
27 changes: 25 additions & 2 deletions test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -4,6 +4,7 @@
from kubeflow.katib import KatibClient
from kubeflow.katib import search
from kubernetes import client
from kubernetes.client import V1Container
from verify import verify_experiment_results

# Experiment timeout is 40 min.
@@ -36,7 +37,19 @@ def objective(parameters):
"b": search.double(min=0.1, max=0.2)
}

# [3] Create Katib Experiment with 4 Trials and 2 CPUs per Trial.
# [3] Create a dummy metrics collector container (the image is not published; it is built by the e2e scripts).
metric_collector = V1Container(
name="dummy-collector",
image="dummy-collector:latest",
command=["python", "/app/dummy-collector.py"],
args=["--metric-name=result"],
env=[
client.V1EnvVar(name="EXPERIMENT_NAME", value=exp_name),
client.V1EnvVar(name="EXPERIMENT_NAMESPACE", value=exp_namespace)
]
)
Member:
I guess we can create another function run_e2e_experiment_create_by_tune_custom to run the e2e test for the custom collector, rather than deleting the original e2e test for the StdOut collector. Then we can run these e2e tests together in this file :)

WDYT👀 @prakhar479 @andreyvelich @tenzen-y @johnugeorge

Author:
Sure, I think it's a good idea. I will make this change in a few days.

Author:
I have made the corresponding changes to run both e2e tests (one with the custom metrics collector and another with the default metrics collector [StdOut]).


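As a rough sketch of the reorganization discussed above; the default-path function name and the shared arguments are assumptions for illustration, not taken from the current file:

def run_e2e_experiment_create_by_tune(katib_client, exp_name, exp_namespace):
    # Existing path: tune() with the default StdOut metrics collector.
    katib_client.tune(
        name=exp_name,
        objective=objective,
        parameters=parameters,
        objective_metric_name="result",
        max_trial_count=4,
        resources_per_trial={"cpu": "2"},
    )

def run_e2e_experiment_create_by_tune_custom(katib_client, exp_name, exp_namespace):
    # New path: tune() with the custom dummy metrics collector container.
    katib_client.tune(
        name=exp_name,
        objective=objective,
        parameters=parameters,
        objective_metric_name="result",
        max_trial_count=4,
        resources_per_trial={"cpu": "2"},
        metrics_collector_config={"kind": "Custom", "custom_collector": metric_collector},
    )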
# [4] Create Katib Experiment with 4 Trials and 2 CPUs per Trial.
# And Wait until Experiment reaches Succeeded condition.
katib_client.tune(
name=exp_name,
@@ -46,6 +59,16 @@ def objective(parameters):
objective_metric_name="result",
max_trial_count=4,
resources_per_trial={"cpu": "2"},
metrics_collector_config={
    "kind": "Custom",
    "custom_collector": metric_collector,
},
)
experiment = katib_client.wait_for_experiment_condition(
exp_name, exp_namespace, timeout=EXPERIMENT_TIMEOUT
@@ -94,4 +117,4 @@ def objective(parameters):
# Delete the Experiment.
logging.info("---------------------------------------------------------------")
logging.info("---------------------------------------------------------------")
katib_client.delete_experiment(exp_name, exp_namespace)
katib_client.delete_experiment(exp_name, exp_namespace)
Member:
I think we need to delete the former experiment before we run another experiment. Otherwise, we may run into an "xxx experiment already exists" error.
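
A small sketch of what this could look like in the test script, assuming both runs reuse the same experiment name:

# Delete the first (StdOut-collector) experiment before starting the second one,
# so the next tune() call does not fail with an "experiment already exists" error.
katib_client.delete_experiment(exp_name, exp_namespace)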
