Initial training procedure #46

Merged · 22 commits · Nov 5, 2024
4 changes: 2 additions & 2 deletions config.yaml
@@ -59,8 +59,8 @@ pipeline:

  data_analysis:
    detector:
      model: xg # XGBoost
      checksum: 21d1f40c9e186a08e9d2b400cea607f4163b39d187a9f9eca3da502b21cf3b9b
      model: rf # RandomForest
      checksum: ba1f718179191348fe2abd51644d76191d42a5d967c6844feb3371b6f798bf06
      base_url: https://heibox.uni-heidelberg.de/d/0d5cbcbe16cd46a58021/
      threshold: 0.5
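The ``base_url`` and ``checksum`` fields suggest that the pre-trained model is downloaded and verified before use. A minimal sketch of that idea, assuming a SHA-256 checksum; the artifact URL and file name are placeholders, not the project's actual download logic:

.. code-block:: python

    import hashlib
    import requests

    def fetch_and_verify(url: str, expected_sha256: str) -> bytes:
        """Download a model artifact and verify its SHA-256 checksum."""
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        payload = response.content
        digest = hashlib.sha256(payload).hexdigest()
        if digest != expected_sha256:
            raise ValueError(f"Checksum mismatch: expected {expected_sha256}, got {digest}")
        return payload

    # model_bytes = fetch_and_verify(base_url + "<model-file>", checksum)  # hypothetical usage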

146 changes: 146 additions & 0 deletions docker/docker-compose.external.yml
@@ -0,0 +1,146 @@
services:

  logcollector:
    build:
      context: ..
      dockerfile: docker/dockerfiles/Dockerfile.logcollector
      network: host
    restart: "unless-stopped"
    depends_on:
      logserver:
        condition: service_started
    networks:
      heidgaf:
        ipv4_address: 172.27.0.7
    volumes:
      - /home/smachmeier/projects/heiDGAF/config.yaml:/usr/src/app/config.yaml
    memswap_limit: 768m
    deploy:
      resources:
        limits:
          cpus: '2'
          memory: 512m
        reservations:
          cpus: '1'
          memory: 256m

  logserver:
    build:
      context: ..
      dockerfile: docker/dockerfiles/Dockerfile.logserver
      network: host
    restart: "unless-stopped"
    ports:
      - 9998:9998
    networks:
      heidgaf:
        ipv4_address: 172.27.0.8
    memswap_limit: 768m
    deploy:
      resources:
        limits:
          cpus: '2'
          memory: 512m
        reservations:
          cpus: '1'
          memory: 256m
    volumes:
      - "${MOUNT_PATH:?MOUNT_PATH not set}:/opt/file.txt"
      - /home/smachmeier/projects/heiDGAF/config.yaml:/usr/src/app/config.yaml


  inspector:
    build:
      context: ..
      dockerfile: docker/dockerfiles/Dockerfile.inspector
      network: host
    restart: "unless-stopped"
    depends_on:
      logserver:
        condition: service_started
      prefilter:
        condition: service_started
      logcollector:
        condition: service_started
    networks:
      heidgaf:
        ipv4_address: 172.27.0.6
    volumes:
      - /home/smachmeier/projects/heiDGAF/config.yaml:/usr/src/app/config.yaml
    deploy:
      mode: "replicated"
      replicas: 1
      resources:
        limits:
          cpus: '2'
          memory: 512m
        reservations:
          cpus: '1'
          memory: 256m

  prefilter:
    build:
      context: ..
      dockerfile: docker/dockerfiles/Dockerfile.prefilter
      network: host
    restart: "unless-stopped"
    depends_on:
      logcollector:
        condition: service_started
      logserver:
        condition: service_started
    networks:
      heidgaf:
        ipv4_address: 172.27.0.9
    volumes:
      - /home/smachmeier/projects/heiDGAF/config.yaml:/usr/src/app/config.yaml
    deploy:
      mode: "replicated"
      replicas: 1
      resources:
        limits:
          cpus: '2'
          memory: 512m
        reservations:
          cpus: '1'
          memory: 256m

  detector:
    build:
      context: ..
      dockerfile: docker/dockerfiles/Dockerfile.detector
      network: host
    restart: "unless-stopped"
    depends_on:
      logcollector:
        condition: service_started
      logserver:
        condition: service_started
    networks:
      heidgaf:
        ipv4_address: 172.27.0.10
    volumes:
      - /home/smachmeier/projects/heiDGAF/config.yaml:/usr/src/app/config.yaml
    deploy:
      mode: "replicated"
      replicas: 1
      resources:
        limits:
          cpus: '2'
          memory: 512m
        reservations:
          cpus: '1'
          memory: 256m
          devices:
            - driver: nvidia
              count: 1 # alternatively, use `count: all` for all GPUs
              capabilities: [gpu]

networks:
  heidgaf:
    driver: bridge
    ipam:
      driver: default
      config:
        - subnet: 172.27.0.0/16
          gateway: 172.27.0.1
1 change: 1 addition & 0 deletions docker/docker-compose.kafka.yml
@@ -2,6 +2,7 @@ services:
  zookeeper:
    image: confluentinc/cp-zookeeper:7.3.2
    container_name: zookeeper
    restart: "unless-stopped"
    networks:
      heidgaf:
        ipv4_address: 172.27.0.2
41 changes: 34 additions & 7 deletions docker/docker-compose.yml
@@ -126,13 +126,40 @@ services:
          cpus: '1'
          memory: 256m

  # detector:
  #   build:
  #     context: ./dockerfiles
  #     dockerfile: Dockerfile.detector
  #   deploy:
  #     mode: "replicated"
  #     replicas: 6
  detector:
    build:
      context: ..
      dockerfile: docker/dockerfiles/Dockerfile.detector
      network: host
    restart: "unless-stopped"
    depends_on:
      kafka1:
        condition: service_healthy
      kafka2:
        condition: service_healthy
      kafka3:
        condition: service_healthy
      logcollector:
        condition: service_started
      logserver:
        condition: service_started
    networks:
      heidgaf:
        ipv4_address: 172.27.0.10
    deploy:
      mode: "replicated"
      replicas: 1
      resources:
        limits:
          cpus: '2'
          memory: 512m
        reservations:
          cpus: '1'
          memory: 256m
          devices:
            - driver: nvidia
              count: 1 # alternatively, use `count: all` for all GPUs
              capabilities: [gpu]

networks:
  heidgaf:
8 changes: 6 additions & 2 deletions docker/dockerfiles/Dockerfile.detector
@@ -1,12 +1,16 @@
FROM python:3
FROM python:3.11-slim-bookworm

ENV PYTHONDONTWRITEBYTECODE=1

WORKDIR /usr/src/app

COPY requirements/requirements.detector.txt ./
RUN pip --disable-pip-version-check install --no-cache-dir --no-compile -r requirements.detector.txt
RUN pip --disable-pip-version-check install --no-cache-dir --no-compile -r requirements.detector.txt

COPY src/base ./src/base
COPY src/detector ./src/detector
COPY config.yaml .

RUN rm -rf /root/.cache

CMD [ "python", "src/detector/detector.py"]
2 changes: 1 addition & 1 deletion docker/dockerfiles/Dockerfile.logcollector
@@ -1,4 +1,4 @@
FROM python:3-slim-bookworm
FROM python:3.11-slim-bookworm

ENV PYTHONDONTWRITEBYTECODE=1

2 changes: 1 addition & 1 deletion docker/dockerfiles/Dockerfile.logserver
@@ -1,4 +1,4 @@
FROM python:3-slim-bookworm
FROM python:3.11-slim-bookworm

ENV PYTHONDONTWRITEBYTECODE=1

2 changes: 1 addition & 1 deletion docker/dockerfiles/Dockerfile.prefilter
@@ -1,4 +1,4 @@
FROM python:3-slim-bookworm
FROM python:3.11-slim-bookworm

ENV PYTHONDONTWRITEBYTECODE=1

23 changes: 11 additions & 12 deletions docs/pipeline.rst
@@ -375,7 +375,7 @@ Overview

The `Inspector` stage is responsible for running time-series-based anomaly detection on prefiltered batches. This stage is essential to reduce
the load on the `Detection` stage.
Otherwise, resource complexity would increase disproportionately.
Otherwise, resource complexity increases disproportionately.

Main Class
----------
@@ -393,20 +393,17 @@ The :class:`Inspector` loads the StreamAD model to perform anomaly detection.
It consumes batches on the topic ``inspect``, usually produced by the ``Prefilter``.
For a new batch, it derives the timestamps ``begin_timestamp`` and ``end_timestamp``.
Based on the time type (e.g. ``s``, ``ms``) and the time range (e.g. ``5``), the non-overlapping sliding window is created.
For univariate time-series, it counts the number of occurances, whereas for multivariate, it considers the packet size. :cite:`schuppen_fanci_2018`
For univariate time series, it counts the number of occurrences, whereas for multivariate time series, it considers the number of occurrences and the packet size. :cite:`schuppen_fanci_2018`
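The windowing can be sketched as follows; this is an illustration only, not the Inspector's actual implementation, and the sample data and summing of packet sizes are assumptions:

.. code-block:: python

    from datetime import datetime, timedelta

    # Hypothetical batch of (timestamp, packet_size) pairs.
    requests_batch = [
        (datetime(2024, 11, 5, 12, 0, 1), 120),
        (datetime(2024, 11, 5, 12, 0, 3), 480),
        (datetime(2024, 11, 5, 12, 0, 7), 95),
    ]

    begin_timestamp = min(ts for ts, _ in requests_batch)
    end_timestamp = max(ts for ts, _ in requests_batch)
    window = timedelta(seconds=5)  # time type "s", time range 5

    # Non-overlapping windows: univariate uses the count only,
    # multivariate uses the count and the packet size.
    univariate, multivariate = [], []
    current = begin_timestamp
    while current <= end_timestamp:
        sizes = [size for ts, size in requests_batch if current <= ts < current + window]
        univariate.append([len(sizes)])
        multivariate.append([len(sizes), sum(sizes)])
        current += window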

.. note:: TODO Add mathematical explanation.

:math:`y = x`

An anomaly is noted when it is greater than a ``score_threshold``. In addition, we support a relative anomaly threshold.
So, if the anomaly threshold is ``0.01``, it sends anomalies for further detection, if the amount of anomlies divided by the total amount of requests in the batch is greater.
An anomaly is noted when its score is greater than the ``score_threshold``.
In addition, we support a relative anomaly threshold.
For example, if the relative anomaly threshold is ``0.01``, anomalies are sent for further detection only if the number of anomalies divided by the total number of requests in the batch is greater than ``0.01``.
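The interplay of the two thresholds reduces to something like the following sketch; the function and variable names are illustrative, not the Inspector's actual code:

.. code-block:: python

    def select_anomalies(scores, score_threshold=0.5, relative_threshold=0.01):
        """Return the anomalous scores if they make up a large enough share of the batch."""
        anomalies = [score for score in scores if score > score_threshold]
        if scores and len(anomalies) / len(scores) > relative_threshold:
            return anomalies  # forwarded to the Detector stage
        return []  # batch is not forwarded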

Configuration
-------------

All StreamAD models are supported. This includes univariate, multivariate, and ensemble methods.
In case special arguments are desired for your environment, the ``model_args`` as a dictionary can be passed for each model.
If special arguments are desired for your environment, ``model_args`` can be passed as a ``dict`` for each model.
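A minimal sketch of what this could look like in code; the keyword arguments are hypothetical and depend on the chosen StreamAD model:

.. code-block:: python

    from streamad.model import OCSVMDetector

    # Hypothetical arguments; see the StreamAD documentation for the
    # parameters each model actually accepts.
    model_args = {"window_len": 100}
    detector = OCSVMDetector(**model_args)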

Univariate models in `streamad.model`:

@@ -417,7 +414,7 @@ Univariate models in `streamad.model`:
- :class:`OCSVMDetector`

Multivariate models in `streamad.model`:
Currently, we rely on the packet size for multivariate processing.
Currently, we rely on the packet size and the number of occurrences for multivariate processing.

- :class:`xStreamDetector`
- :class:`RShashDetector`
@@ -439,11 +436,13 @@ Stage 5: Detection
Overview
--------

The `Detector` resembles the heart of heiDGAF. It runs pre-trained machine learning models to get a probability outcome of DNS requests.
The `Detector` is the heart of heiDGAF. It runs pre-trained machine learning models to obtain a probability score for the DNS requests.
The pre-trained models are available online under the EUPL-1.2 license.
In total, we rely on the following data sets for the pre-trained models we offer:

- `CIC-Bell-DNS-2021 <https://www.unb.ca/cic/datasets/dns-2021.html>`_
- `DGTA-BENCH - Domain Generation and Tunneling Algorithms for Benchmark <https://data.mendeley.com/datasets/2wzf9bz7xr/1>`_
- `DGArchive <https://dgarchive.caad.fkie.fraunhofer.de/>`_

Main Class
----------
@@ -456,7 +455,7 @@ Usage

The :class:`Detector` consumes anomalous batches of requests.
It calculates a probability score for each request and, finally, an overall score for the batch.
Such alerts are log to ``/tmp/warnings.json``.
Alerts are logged to ``/tmp/warnings.json``.
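Conceptually, the alerting boils down to something like this sketch; the file path is taken from above, while the aggregation into an overall score is an assumption:

.. code-block:: python

    import json

    def report_batch(request_scores, threshold=0.5, path="/tmp/warnings.json"):
        """Append an alert when the overall batch score exceeds the threshold."""
        overall_score = sum(request_scores) / len(request_scores)
        if overall_score > threshold:
            with open(path, "a") as warnings_file:
                json.dump({"overall_score": overall_score, "scores": request_scores}, warnings_file)
                warnings_file.write("\n")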

Configuration
-------------
14 changes: 14 additions & 0 deletions docs/training.rst
@@ -3,3 +3,17 @@ Training

Overview
========

In total, we support ``RandomForest`` and ``XGBoost``.
The :class:`DetectorTraining` class provides the main entry point to fit any model.

After initialisation, it supports the following data sets:

- ``all``: Includes all available data sets
- ``cic``: Train on the CICBellDNS2021 data set
- ``dgta``: Train on the DGTA Benchmarking data set
- ``dgarchive``: Train on the DGArchive data set

For hyperparameter optimisation, we use ``optuna``.
The training offers GPU support to speed up the search for the best parameters.
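A compact sketch of what such an ``optuna`` study could look like for the ``RandomForest`` model; the search space, scoring, and data loading are placeholders rather than the project's actual training code:

.. code-block:: python

    import optuna
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_score

    def objective(trial, X, y):
        # Hypothetical search space.
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_depth": trial.suggest_int("max_depth", 3, 20),
        }
        model = RandomForestClassifier(**params, n_jobs=-1)
        return cross_val_score(model, X, y, cv=3, scoring="f1").mean()

    # X, y = ...  # load one of the data sets listed above
    # study = optuna.create_study(direction="maximize")
    # study.optimize(lambda trial: objective(trial, X, y), n_trials=50)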
8 changes: 8 additions & 0 deletions docs/usage.rst
@@ -17,6 +17,14 @@ If you want to use heiDGAF, just use the provided ``docker-compose.yml`` to quic

$ docker compose -f docker/docker-compose.yml up

Run containers individually:


.. code-block:: console

$ docker compose -f docker/docker-compose.kafka.yml up
$ docker run ...

Installation
------------

8 changes: 5 additions & 3 deletions requirements/requirements.detector.txt
@@ -1,6 +1,8 @@
joblib
xgboost
marshmallow_dataclass~=8.7.1
scikit-learn~=1.5.2
requests
confluent-kafka~=2.4.0
colorlog~=6.8.2
PyYAML~=6.0.1
colorlog~=6.8.2
confluent-kafka~=2.4.0
marshmallow_dataclass~=8.7.1
8 changes: 5 additions & 3 deletions requirements/requirements.train.txt
@@ -1,5 +1,7 @@
numpy
polars
torch
xgboost
scikit-learn
scikit-learn~=1.5.2
scipy
torch
pyarrow
polars
3 changes: 1 addition & 2 deletions src/base/__init__.py
@@ -1,6 +1,5 @@
from typing import Optional, List, Dict
from typing import List
from dataclasses import dataclass, field
import marshmallow_dataclass
import marshmallow.validate
import datetime
