From b86fe77fe37b2650d6dd71c783037e60fc238602 Mon Sep 17 00:00:00 2001 From: tamirmich Date: Mon, 6 Feb 2023 13:34:22 +0200 Subject: [PATCH] fluentd monitoring (#85) * added prometheus plugin * added templates for monitoring fluentd --- Gemfile | 1 + README.md | 19 +- configmap-monitoring.yaml | 399 ++++++++++++++++++++ logzio-daemonset-containerd-monitoring.yaml | 134 +++++++ logzio-daemonset-rbac-monitoring.yaml | 134 +++++++ 5 files changed, 685 insertions(+), 2 deletions(-) create mode 100644 configmap-monitoring.yaml create mode 100644 logzio-daemonset-containerd-monitoring.yaml create mode 100644 logzio-daemonset-rbac-monitoring.yaml diff --git a/Gemfile b/Gemfile index aa89bc9..87c2b72 100644 --- a/Gemfile +++ b/Gemfile @@ -3,3 +3,4 @@ source "https://rubygems.org" gem "fluent-plugin-logzio", "0.0.22" gem "fluent-plugin-kubernetes_metadata_filter", ">=3.1.2" gem "fluent-plugin-dedot_filter", ">=1.0.0" +gem "fluent-plugin-prometheus", ">=2.0.3" \ No newline at end of file diff --git a/README.md b/README.md index ce190b6..b97a505 100755 --- a/README.md +++ b/README.md @@ -67,6 +67,16 @@ For container runtime Containerd: kubectl apply -f https://raw.githubusercontent.com/logzio/logzio-k8s/master/logzio-daemonset-containerd.yaml -f https://raw.githubusercontent.com/logzio/logzio-k8s/master/configmap.yaml ``` +For monitoring fluentd with runtime Containerd: +```shell +kubectl apply -f https://raw.githubusercontent.com/logzio/logzio-k8s/master/logzio-daemonset-containerd-monitoring.yaml -f https://raw.githubusercontent.com/logzio/logzio-k8s/master/configmap-monitoring.yaml +``` + +For monitoring fluentd with runtime Docker: +```shell +kubectl apply -f https://raw.githubusercontent.com/logzio/logzio-k8s/master/logzio-daemonset-rbac-monitoring.yaml -f https://raw.githubusercontent.com/logzio/logzio-k8s/master/configmap-monitoring.yaml +``` + #### 4. 
Check Logz.io for your logs Give your logs some time to get from your system to ours, and then open [Kibana](https://app.logz.io/#/dashboard/kibana). @@ -195,14 +205,19 @@ See the [troubleshooting document](https://github.com/logzio/logzio-k8s/blob/mas To suppress Fluentd system messages, set the `FLUENTD_SYSTEMD_CONF` environment variable to `disable` in your Kubernetes environment. -### Disable prometheus input plugins +### Enable Prometheus monitoring -By default, latest images launch `prometheus` plugins to monitor fluentd. You can disable prometheus input plugin by setting `disable` to `FLUENTD_PROMETHEUS_CONF` environment variable in your kubernetes configuration. +To monitor fluentd and collect input & output metrics, you can +deploy `logzio-daemonset-containerd-monitoring` or `logzio-daemonset-rbac-monitoring`, and `configmap-monitoring.yaml`. +These templates collect and expose fluentd metrics on port `24231`, at the `/metrics` endpoint. The templates contain annotations that make it easy to ship these metrics with a Prometheus shipper. ### Changelog **logzio/logzio-fluentd**: +- v1.3.1: + - Added `fluent-plugin-prometheus`. + - Added `logzio-daemonset-containerd-monitoring`, `logzio-daemonset-rbac-monitoring` and `configmap-monitoring.yaml`, which expose fluentd metrics on the pod's port `24231`, at the `/metrics` endpoint. - v1.3.0: - Added plugin `fluent-plugin-dedot_filter`. - Updated image in daemonset templates, increased memory and cpu requirements. 
diff --git a/configmap-monitoring.yaml b/configmap-monitoring.yaml new file mode 100644 index 0000000..b95241d --- /dev/null +++ b/configmap-monitoring.yaml @@ -0,0 +1,399 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: fluentd-config-monitoring + namespace: monitoring + labels: + k8s-app: fluentd-logzio +data: + fluent.conf: | + @include "#{ENV['FLUENTD_SYSTEMD_CONF'] || 'systemd'}.conf" + @include prometheus.conf + @include kubernetes.conf + @include system.conf + @include conf.d/*.conf + + + @type logzio_buffered + @id out_logzio + endpoint_url "#{ENV['LOGZIO_LOG_LISTENER']}?token=#{ENV['LOGZIO_LOG_SHIPPING_TOKEN']}" + output_include_time true + output_include_tags true + + # Set the buffer type to file to improve the reliability and reduce the memory consumption + @type "#{ENV['LOGZIO_BUFFER_TYPE']}" + path "#{ENV['LOGZIO_BUFFER_PATH']}" + # Set queue_full action to block because we want to pause gracefully + # in case of the off-the-limits load instead of throwing an exception + overflow_action "#{ENV['LOGZIO_OVERFLOW_ACTION']}" + # Set the chunk limit conservatively to avoid exceeding the GCL limit + # of 10MiB per write request. + chunk_limit_size "#{ENV['LOGZIO_CHUNK_LIMIT_SIZE']}" + # Cap the combined memory usage of this buffer and the one below to + # 2MiB/chunk * (6 + 2) chunks = 16 MiB + queue_limit_length "#{ENV['LOGZIO_QUEUE_LIMIT_LENGTH']}" + # Never wait more than 5 seconds before flushing logs in the non-error case. + flush_interval "#{ENV['LOGZIO_FLUSH_INTERVAL']}" + # Never wait longer than 30 seconds between retries. + retry_max_interval "#{ENV['LOGZIO_RETRY_MAX_INTERVAL']}" + # Disable the limit on the number of retries (retry forever). + retry_forever "#{ENV['LOGZIO_RETRY_FOREVER']}" + # Use multiple threads for processing. 
+ flush_thread_count "#{ENV['LOGZIO_FLUSH_THREAD_COUNT']}" + + + kubernetes.conf: | + + + + @type tail + @id in_tail_container_logs + path /var/log/containers/*.log + pos_file /var/log/fluentd-containers.log.pos + exclude_path /var/log/containers/fluentd*.log + tag logzio.kubernetes.* + read_from_head true + + @type multi_format + + # for docker cri + format json + time_key time + time_format %Y-%m-%dT%H:%M:%S.%NZ + keep_time_key true + + + # for containerd cri + # format /^(? + + + + + @type tail + @id in_tail_minion + path /var/log/salt/minion + pos_file /var/log/fluentd-salt.pos + tag logzio.salt + + @type regexp + expression /^(? + + + + @type tail + @id in_tail_startupscript + path /var/log/startupscript.log + pos_file /var/log/fluentd-startupscript.log.pos + tag logzio.startupscript + + @type syslog + + + + + @type tail + @id in_tail_docker + path /var/log/docker.log + pos_file /var/log/fluentd-docker.log.pos + tag logzio.docker + + @type regexp + expression /^time="(? + + + + @type tail + @id in_tail_etcd + path /var/log/etcd.log + pos_file /var/log/fluentd-etcd.log.pos + tag logzio.etcd + + @type none + + + + + @type tail + @id in_tail_kubelet + multiline_flush_interval 5s + path /var/log/kubelet.log + pos_file /var/log/fluentd-kubelet.log.pos + tag logzio.kubelet + + @type kubernetes + + + + + @type tail + @id in_tail_kube_proxy + multiline_flush_interval 5s + path /var/log/kube-proxy.log + pos_file /var/log/fluentd-kube-proxy.log.pos + tag logzio.kube-proxy + + @type kubernetes + + + + + @type tail + @id in_tail_kube_apiserver + multiline_flush_interval 5s + path /var/log/kube-apiserver.log + pos_file /var/log/fluentd-kube-apiserver.log.pos + tag logzio.kube-apiserver + + @type kubernetes + + + + + @type tail + @id in_tail_kube_controller_manager + multiline_flush_interval 5s + path /var/log/kube-controller-manager.log + pos_file /var/log/fluentd-kube-controller-manager.log.pos + tag logzio.kube-controller-manager + + @type kubernetes + + + + + @type tail 
+ @id in_tail_kube_scheduler + multiline_flush_interval 5s + path /var/log/kube-scheduler.log + pos_file /var/log/fluentd-kube-scheduler.log.pos + tag logzio.kube-scheduler + + @type kubernetes + + + + + @type tail + @id in_tail_rescheduler + multiline_flush_interval 5s + path /var/log/rescheduler.log + pos_file /var/log/fluentd-rescheduler.log.pos + tag logzio.rescheduler + + @type kubernetes + + + + + @type tail + @id in_tail_glbc + multiline_flush_interval 5s + path /var/log/glbc.log + pos_file /var/log/fluentd-glbc.log.pos + tag logzio.glbc + + @type kubernetes + + + + + @type tail + @id in_tail_cluster_autoscaler + multiline_flush_interval 5s + path /var/log/cluster-autoscaler.log + pos_file /var/log/fluentd-cluster-autoscaler.log.pos + tag logzio.cluster-autoscaler + + @type kubernetes + + + + @include "#{ENV['AUDIT_LOG_FORMAT'] || 'audit'}.conf" + + # This handles multiline exceptions automatically: https://github.com/GoogleCloudPlatform/fluent-plugin-detect-exceptions + + @type detect_exceptions + remove_tag_prefix logzio + message log + languages all + multiline_flush_interval 0.1 + + + @include "partial-#{ENV['CRI']}.conf" + + # This adds type to the log && change key log to message + + @type record_modifier + + type k8s + message ${record["log"]} + + remove_keys log + + + + @type kubernetes_metadata + @id filter_kube_metadata + kubernetes_url "#{ENV['FLUENT_FILTER_KUBERNETES_URL'] || 'https://' + ENV.fetch('KUBERNETES_SERVICE_HOST') + ':' + ENV.fetch('KUBERNETES_SERVICE_PORT') + '/api'}" + verify_ssl "#{ENV['KUBERNETES_VERIFY_SSL'] || true}" + + + @type dedot + de_dot true + de_dot_separator _ + de_dot_nested true + + + system.conf: | + + log_level "#{ENV['LOGZIO_LOG_LEVEL']}" + + + systemd.conf: | + # Logs from systemd-journal for interesting services. 
+ + @type systemd + @id in_systemd_kubelet + filters [{ "_SYSTEMD_UNIT": "kubelet.service" }] + + @type local + persistent true + path /var/log/fluentd-journald-kubelet-cursor.json + + read_from_head true + tag kubelet + + + # Logs from docker-systemd + + @type systemd + @id in_systemd_docker + filters [{ "_SYSTEMD_UNIT": "docker.service" }] + + @type local + persistent true + path /var/log/fluentd-journald-docker-cursor.json + + read_from_head true + tag docker.systemd + + + # Logs from systemd-journal for interesting services. + + @type systemd + @id in_systemd_bootkube + filters [{ "_SYSTEMD_UNIT": "bootkube.service" }] + + @type local + persistent true + path /var/log/fluentd-journald-bootkube-cursor.json + + read_from_head true + tag bootkube + + + audit.conf: | + # Example: + # 2017-02-09T00:15:57.992775796Z AUDIT: id="90c73c7c-97d6-4b65-9461-f94606ff825f" ip="104.132.1.72" method="GET" user="kubecfg" as="" asgroups="" namespace="default" uri="/api/v1/namespaces/default/pods" + # 2017-02-09T00:15:57.993528822Z AUDIT: id="90c73c7c-97d6-4b65-9461-f94606ff825f" response="200" + + @type tail + @id in_tail_kube_apiserver_audit + multiline_flush_interval 5s + path /var/log/kubernetes/kube-apiserver-audit.log + pos_file /var/log/kube-apiserver-audit.log.pos + tag logzio.kube-apiserver-audit + + @type multiline + format_firstline /^\S+\s+AUDIT:/ + # Fields must be explicitly captured by name to be parsed into the record. + # Fields may not always be present, and order may change, so this just looks + # for a list of key="\"quoted\" value" pairs separated by spaces. + # Unknown fields are ignored. + # Note: We can't separate query/response lines as format1/format2 because + # they don't always come one after the other for a given query. + format1 /^(? 
+ + + audit-json.conf: | + + @type tail + @id in_tail_kube_apiserver_audit + multiline_flush_interval 5s + path /var/log/kubernetes/kube-apiserver-audit.log + pos_file /var/log/kube-apiserver-audit.log.pos + tag logzio.kube-apiserver-audit + + @type json + keep_time_key true + time_key timestamp + time_format %Y-%m-%dT%T.%L%Z + + + + partial-docker.conf: | + # Concat docker cri partial log + # https://github.com/fluent-plugins-nursery/fluent-plugin-concat + # https://github.com/moby/moby/issues/34620#issuecomment-619369707 + + @type concat + key log + use_first_timestamp true + multiline_end_regexp /\n$/ + separator "" + + + partial-containerd.conf: | + # Concat containerd cri partial log + # https://github.com/fluent/fluentd-kubernetes-daemonset/issues/412#issuecomment-636536767 + + @type concat + key log + use_first_timestamp true + partial_key logtag + partial_value P + separator "" + + prometheus.conf: | + + @type prometheus + @id in_prometheus + bind "0.0.0.0" + port 24231 + metrics_path "/metrics" + + + @type prometheus + + name fluentd_input_status_num_records_total + type counter + desc The total number of incoming records + + tag ${tag} + hostname ${hostname} + + + + + @type prometheus_monitor + @id in_prometheus_monitor + + + @type prometheus_output_monitor + @id in_prometheus_output_monitor + + diff --git a/logzio-daemonset-containerd-monitoring.yaml b/logzio-daemonset-containerd-monitoring.yaml new file mode 100644 index 0000000..d8d5609 --- /dev/null +++ b/logzio-daemonset-containerd-monitoring.yaml @@ -0,0 +1,134 @@ +# Exposes fluentd metrics on port 24231, /metrics. 
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: fluentd
+  namespace: monitoring
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: fluentd  # ClusterRole is cluster-scoped, so it carries no namespace
+rules:
+- apiGroups:
+  - ""
+  resources:
+  - pods
+  - namespaces
+  verbs:
+  - get
+  - list
+  - watch
+
+---
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: fluentd
+roleRef:
+  kind: ClusterRole
+  name: fluentd
+  apiGroup: rbac.authorization.k8s.io
+subjects:
+- kind: ServiceAccount
+  name: fluentd
+  namespace: monitoring
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: fluentd-logzio
+  namespace: monitoring
+  labels:
+    k8s-app: fluentd-logzio
+    version: v1
+spec:
+  selector:
+    matchLabels:
+      k8s-app: fluentd-logzio
+  template:
+    metadata:
+      labels:
+        k8s-app: fluentd-logzio
+        version: v1
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "24231"  # must match the "metrics" containerPort below
+    spec:
+      serviceAccountName: fluentd  # the deprecated "serviceAccount" alias is not set
+      tolerations:
+      - key: node-role.kubernetes.io/master
+        effect: NoSchedule
+      - key: node-role.kubernetes.io/control-plane  # taint name on newer clusters
+        effect: NoSchedule
+      # Because the image's entrypoint requires to write on /fluentd/etc but we mount configmap there which is read-only,
+      # this initContainers workaround or other is needed.
+      # See https://github.com/fluent/fluentd-kubernetes-daemonset/issues/90
+      initContainers:
+      - name: copy-fluentd-config
+        image: busybox:1.36  # pinned tag instead of the implicit "latest"
+        command: ['sh', '-c', 'cp /config-volume/..data/* /fluentd/etc']
+        volumeMounts:
+        - name: config-volume
+          mountPath: /config-volume
+        - name: fluentdconf
+          mountPath: /fluentd/etc
+      containers:
+      - name: fluentd
+        image: logzio/logzio-fluentd:1.3.1
+        ports:
+        - name: metrics
+          containerPort: 24231
+          protocol: TCP
+        env:
+        - name: LOGZIO_LOG_SHIPPING_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: logzio-logs-secret
+              key: logzio-log-shipping-token
+        - name: LOGZIO_LOG_LISTENER
+          valueFrom:
+            secretKeyRef:
+              name: logzio-logs-secret
+              key: logzio-log-listener
+        - name: FLUENTD_SYSTEMD_CONF
+          value: "disable"
+        - name: INCLUDE_NAMESPACE
+          value: ""
+        - name: KUBERNETES_VERIFY_SSL
+          value: "true"
+        - name: AUDIT_LOG_FORMAT
+          value: audit
+        - name: CRI
+          value: "containerd"  # kubernetes.conf includes "partial-<CRI>.conf"
+        resources:
+          limits:
+            memory: 500Mi
+          requests:
+            cpu: 200m
+            memory: 500Mi
+        volumeMounts:
+        - name: varlog
+          mountPath: /var/log
+        - name: varlibdockercontainers
+          mountPath: /var/lib/docker/containers
+          readOnly: true
+        - name: config-volume
+          mountPath: /config-volume
+        - name: fluentdconf
+          mountPath: /fluentd/etc
+      terminationGracePeriodSeconds: 30
+      volumes:
+      - name: varlog
+        hostPath:
+          path: /var/log
+      - name: varlibdockercontainers
+        hostPath:
+          path: /var/lib/docker/containers
+      - name: config-volume
+        configMap:
+          name: fluentd-config-monitoring
+      - name: fluentdconf
+        emptyDir: {}  # writable /fluentd/etc, filled by the init container above
diff --git a/logzio-daemonset-rbac-monitoring.yaml b/logzio-daemonset-rbac-monitoring.yaml
new file mode 100644
index 0000000..61496cf
--- /dev/null
+++ b/logzio-daemonset-rbac-monitoring.yaml
@@ -0,0 +1,134 @@
+# Exposes fluentd metrics on port 24231, /metrics.
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: fluentd
+  namespace: monitoring
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: fluentd  # ClusterRole is cluster-scoped, so it carries no namespace
+rules:
+- apiGroups:
+  - ""
+  resources:
+  - pods
+  - namespaces
+  verbs:
+  - get
+  - list
+  - watch
+
+---
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: fluentd
+roleRef:
+  kind: ClusterRole
+  name: fluentd
+  apiGroup: rbac.authorization.k8s.io
+subjects:
+- kind: ServiceAccount
+  name: fluentd
+  namespace: monitoring
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: fluentd-logzio
+  namespace: monitoring
+  labels:
+    k8s-app: fluentd-logzio
+    version: v1
+spec:
+  selector:
+    matchLabels:
+      k8s-app: fluentd-logzio
+  template:
+    metadata:
+      labels:
+        k8s-app: fluentd-logzio
+        version: v1
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "24231"  # must match the "metrics" containerPort below
+    spec:
+      serviceAccountName: fluentd  # the deprecated "serviceAccount" alias is not set
+      tolerations:
+      - key: node-role.kubernetes.io/master
+        effect: NoSchedule
+      - key: node-role.kubernetes.io/control-plane  # taint name on newer clusters
+        effect: NoSchedule
+      # Because the image's entrypoint requires to write on /fluentd/etc but we mount configmap there which is read-only,
+      # this initContainers workaround or other is needed.
+      # See https://github.com/fluent/fluentd-kubernetes-daemonset/issues/90
+      initContainers:
+      - name: copy-fluentd-config
+        image: busybox:1.36  # pinned tag instead of the implicit "latest"
+        command: ['sh', '-c', 'cp /config-volume/..data/* /fluentd/etc']
+        volumeMounts:
+        - name: config-volume
+          mountPath: /config-volume
+        - name: fluentdconf
+          mountPath: /fluentd/etc
+      containers:
+      - name: fluentd
+        image: logzio/logzio-fluentd:1.3.1
+        ports:
+        - name: metrics
+          containerPort: 24231
+          protocol: TCP
+        env:
+        - name: LOGZIO_LOG_SHIPPING_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: logzio-logs-secret
+              key: logzio-log-shipping-token
+        - name: LOGZIO_LOG_LISTENER
+          valueFrom:
+            secretKeyRef:
+              name: logzio-logs-secret
+              key: logzio-log-listener
+        - name: FLUENTD_SYSTEMD_CONF
+          value: "disable"
+        - name: INCLUDE_NAMESPACE
+          value: ""
+        - name: KUBERNETES_VERIFY_SSL
+          value: "true"
+        - name: AUDIT_LOG_FORMAT
+          value: audit
+        - name: CRI
+          value: "docker"  # kubernetes.conf includes "partial-<CRI>.conf"
+        resources:
+          limits:
+            memory: 500Mi
+          requests:
+            cpu: 200m
+            memory: 500Mi
+        volumeMounts:
+        - name: varlog
+          mountPath: /var/log
+        - name: varlibdockercontainers
+          mountPath: /var/lib/docker/containers
+          readOnly: true
+        - name: config-volume
+          mountPath: /config-volume
+        - name: fluentdconf
+          mountPath: /fluentd/etc
+      terminationGracePeriodSeconds: 30
+      volumes:
+      - name: varlog
+        hostPath:
+          path: /var/log
+      - name: varlibdockercontainers
+        hostPath:
+          path: /var/lib/docker/containers
+      - name: config-volume
+        configMap:
+          name: fluentd-config-monitoring
+      - name: fluentdconf
+        emptyDir: {}  # writable /fluentd/etc, filled by the init container above