feat regression benchmark service

Kayzz99 · Kayzz99 · commit e790456fcc61 · 2024-03-05T10:42:00.000+01:00
diff --git a/requirements-all.txt b/requirements-all.txt
@@ -0,0 +1,16 @@
+click==8.1.7
+joblib==1.3.2
+lazypredict-nightly==0.3.0
+lightgbm==4.3.0
+numpy==1.26.4
+pandas==2.2.1
+python-dateutil==2.9.0.post0
+pytz==2024.1
+scikit-learn==1.4.1.post1
+scipy==1.12.0
+six==1.16.0
+threadpoolctl==3.3.0
+tqdm==4.66.2
+tzdata==2024.1
+xgboost==2.0.3
+Werkzeug==2.3.6
diff --git a/requirements.txt b/requirements.txt
@@ -1 +1,2 @@
 common-code[test] @ git+https://github.com/swiss-ai-center/common-code.git@main
+lazypredict-nightly==0.3.0
diff --git a/src/main.py b/src/main.py
@@ -14,20 +14,26 @@
 from common_code.tasks.models import TaskData
 from common_code.service.models import Service
 from common_code.service.enums import ServiceStatus
-from common_code.common.enums import FieldDescriptionType, ExecutionUnitTagName, ExecutionUnitTagAcronym
+from common_code.common.enums import (
+    FieldDescriptionType,
+    ExecutionUnitTagName,
+    ExecutionUnitTagAcronym,
+)
 from common_code.common.models import FieldDescription, ExecutionUnitTag
 from contextlib import asynccontextmanager
 
 # Imports required by the service's model
-# TODO: 1. ADD REQUIRED IMPORTS (ALSO IN THE REQUIREMENTS.TXT)
+import pandas as pd
+from lazypredict.Supervised import LazyRegressor
+from sklearn.model_selection import train_test_split
+import io
 
 settings = get_settings()
 
 
 class MyService(Service):
-    # TODO: 2. CHANGE THIS DESCRIPTION
     """
-    My service model
+    Benchmark multiple models on a dataset and return the results.
     """
 
     # Any additional fields must be excluded for Pydantic to work
@@ -36,32 +42,22 @@ class MyService(Service):
 
     def __init__(self):
         super().__init__(
-            # TODO: 3. CHANGE THE SERVICE NAME AND SLUG
-            name="My Service",
-            slug="my-service",
+            name="Regression Benchmark",
+            slug="regression-benchmark",
             url=settings.service_url,
             summary=api_summary,
             description=api_description,
             status=ServiceStatus.AVAILABLE,
-            # TODO: 4. CHANGE THE INPUT AND OUTPUT FIELDS, THE TAGS AND THE HAS_AI VARIABLE
             data_in_fields=[
-                FieldDescription(
-                    name="image",
-                    type=[
-                        FieldDescriptionType.IMAGE_PNG,
-                        FieldDescriptionType.IMAGE_JPEG,
-                    ],
-                ),
+                FieldDescription(name="dataset", type=[FieldDescriptionType.TEXT_CSV]),
             ],
             data_out_fields=[
-                FieldDescription(
-                    name="result", type=[FieldDescriptionType.APPLICATION_JSON]
-                ),
+                FieldDescription(name="result", type=[FieldDescriptionType.TEXT_PLAIN]),
             ],
             tags=[
                 ExecutionUnitTag(
-                    name=ExecutionUnitTagName.IMAGE_PROCESSING,
-                    acronym=ExecutionUnitTagAcronym.IMAGE_PROCESSING,
+                    name=ExecutionUnitTagName.DATA_PREPROCESSING,
+                    acronym=ExecutionUnitTagAcronym.DATA_PREPROCESSING,
                 ),
             ],
             has_ai=False,
@@ -70,18 +66,38 @@ def __init__(self):
         )
         self._logger = get_logger(settings)
 
-    # TODO: 5. CHANGE THE PROCESS METHOD (CORE OF THE SERVICE)
     def process(self, data):
-        # NOTE that the data is a dictionary with the keys being the field names set in the data_in_fields
-        # The objects in the data variable are always bytes. It is necessary to convert them to the desired type
-        # before using them.
-        # raw = data["image"].data
-        # input_type = data["image"].type
-        # ... do something with the raw data
-
-        # NOTE that the result must be a dictionary with the keys being the field names set in the data_out_fields
+
+        raw = str(data["dataset"].data)
+        raw = (
+            raw.replace(",", ";")
+            .replace("\\n", "\n")
+            .replace("\\r", "\n")
+            .replace("b'", "")
+        )
+
+        lines = raw.splitlines()
+        if lines[-1] == "" or lines[-1] == "'":
+            lines.pop()
+        raw = "\n".join(lines)
+
+        data_df = pd.read_csv(io.StringIO(raw), sep=";")
+
+        X = data_df.drop("target", axis=1)
+        y = data_df["target"]
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=0.2, random_state=42
+        )
+        reg = LazyRegressor(verbose=0, custom_metric=None)
+        models, predictions = reg.fit(X_train, X_test, y_train, y_test)
+
+        buf = io.BytesIO()
+        buf.write(models.to_string().encode("utf-8"))
+
         return {
-            "result": TaskData(data=..., type=FieldDescriptionType.APPLICATION_JSON)
+            "result": TaskData(
+                data=buf.getvalue(), type=FieldDescriptionType.TEXT_PLAIN
+            ),
         }
 
 
@@ -115,7 +131,9 @@ async def announce():
         for engine_url in settings.engine_urls:
             announced = False
             while not announced and retries > 0:
-                announced = await service_service.announce_service(my_service, engine_url)
+                announced = await service_service.announce_service(
+                    my_service, engine_url
+                )
                 retries -= 1
                 if not announced:
                     time.sleep(settings.engine_announce_retry_delay)
@@ -135,19 +153,21 @@ async def announce():
         await service_service.graceful_shutdown(my_service, engine_url)
 
 
-# TODO: 6. CHANGE THE API DESCRIPTION AND SUMMARY
-api_description = """My service
-bla bla bla...
+api_description = """This service benchmarks a dataset with various models and outputs the results sorted by accuracy.
+In order for the service to work your dataset label column must be called "target".
+Also to improve the results you may want to remove uneccessary columns from the dataset.
+Finally, avoid having multiple empty lines at the end of the file.
 """
-api_summary = """My service
-bla bla bla...
+api_summary = """This service benchmarks a dataset with various models and outputs the results sorted by accuracy.
+In order for the service to work your dataset label column must be called "target".
+Also to improve the results you may want to remove uneccessary columns from the dataset.
+Finally, avoid having multiple empty lines at the end of the file.
 """
 
 # Define the FastAPI application with information
-# TODO: 7. CHANGE THE API TITLE, VERSION, CONTACT AND LICENSE
 app = FastAPI(
     lifespan=lifespan,
-    title="Sample Service API.",
+    title="Regression benchmark API.",
     description=api_description,
     version="0.0.1",
     contact={
diff --git a/src/test_data/boston_house_prices.csv b/src/test_data/boston_house_prices.csv

Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`	`1`	`common-code[test] @ git+https://github.com/swiss-ai-center/common-code.git@main`
	`2`	`+lazypredict-nightly==0.3.0`