diff --git a/website/en/documents/components/metrics/prometheus-metric.md b/website/en/documents/components/metrics/prometheus-metric.md new file mode 100644 index 000000000..40c61fc50 --- /dev/null +++ b/website/en/documents/components/metrics/prometheus-metric.md @@ -0,0 +1,800 @@ +# bitsail-metrics-prometheus + +----- + +Parent document: [bitsail-components](../README.md) + +## BitSail Metrics Introduction + +Metric monitoring can report the running status of the current data integration task in real time, and it is convenient for users to locate the problem when the job fails. BitSail provides a Metric collection module for users, with support for the Prometheus monitoring system. Users can monitor various indicators when BitSail is running by customizing the Grafana dashboard. + +![img](../../../../images/documents/components/metrics/prometheus/metric_manager_en.png) + +- MetricManager is mainly responsible for the management of MetricGroup, and obtains MetricReporter through `MetricReporterFactory`. +- Scheduled is used to define the time interval for Reporter to report monitoring metrics. +- MetricReporter is used to report the metric to the external backend for processing. + +![img](../../../../images/documents/components/metrics/prometheus/metric_report_diagram.png) + +To report metrics for different service monitoring systems, you only need to obtain the corresponding MetricReporterBuilder according to METRICS_REPORTER_TYPE in CommonOptions to obtain the corresponding MetricReporter. + +### Metrics Type + +BitSail's metrics are implemented by introducing the `com.codahale.metrics` package, and the collected metrics are divided into Counter, Gauge, Histogram, Meter, and Timer: + +- Meter is mainly used to count the rate of an event in the system, which can reflect the current processing capacity of the system and help us judge whether resources are insufficient. 
It makes it easy to count the number of requests per second (TPS), the number of queries per second (QPS), the average number of requests per second in the last 1 minute, the average number of requests per second in the last 5 minutes, the average number of requests per second in the last 15 minutes, etc. +- Gauge is the simplest metric, only a simple return value, usually used to record the instantaneous value of some objects or things. Gauge can be used to implement custom measurement types, for example to measure the number of tasks in a pending queue or the current memory usage. +- Counter is a cumulative metric, which encapsulates AtomicLong with Gauge internally. It is mainly used to count the total number of jobs in the queue, the number of errors, the number of service requests, and other scenarios. +- Histogram is a measure of the distribution of statistical data, providing minimum, maximum, mean, and median values, as well as the 75th, 90th, 95th, 98th, 99th, and 99.9th percentile values. Typical scenarios include tracking the maximum, minimum, average, and median of traffic. +- The essence of Timer is the combination of Histogram and Meter, which can conveniently count the request rate and processing time, for example disk read latency or interface call latency. + +Example applications of BitSail: + +- Use CallTracer to count the call throughput and execution delay of the code in *try* + +![img](../../../../images/documents/components/metrics/prometheus/call_tracer.png) + +- Use the Timer class metric to count the qps and execution time of the function +- Use the Counter class metric to count the throughput + +- Count the number of bytes of successfully written and read records and the number of failed records in DelegateFlinkWriter and DelegateSourcePipeline. 
+ +## Prometheus and BitSail Integration + +The installation and use of Prometheus and Grafana are not covered in detail here. Under normal circumstances, Prometheus uses the pull mode to pull monitoring data from jobs or exporters that generate metrics (such as NodeExporter that monitors hosts). BitSail supports the pull mode of Prometheus through PrometheusMetricReporter, and the push mode of Prometheus through PrometheusPushGatewayReporter. + +Prometheus adopts the pull mode, so it may be unable to pull data directly from each target when the target is not in the same subnet or is behind a firewall. So here we show the use of PushGateway mode. PushGateway is a transit component. By configuring the BitSail job, the metric is pushed to PushGateway, and Prometheus pulls it from PushGateway. PushGateway can persist all monitoring data pushed to it. Therefore, even if your monitoring is offline, Prometheus can still pull the old monitoring data. + +### common configuration + +| Param name | Required | Optional value | Description | +| --------------------- | -------- | ---------------------------------------------------- | -------------------------------------------------------- | +| metrics_reporter_type | No | "prometheus", "prometheus_pushgateway", "log", "nop" | The type of metrics reporter, the default value is "log" | + +### metric configuration + +#### PrometheusMetricReporter + +| Param name | Required | Default value | Description | +| --------------- | -------- | ------------- | ----------------------------------------------------------- | +| prometheus_host | No | 9999 | The port number that bitsail pushes to the Prometheus server | + +#### PrometheusPushGatewayReporter + +| Param name | Required | Default value | Description | +| ---------------------------------- | -------- | ------------- | ------------------------------------------------------------ | +| pushgateway_host | No | localhost | host of prometheus pushgateway | +| pushgateway_port | No | 9091 | port 
of prometheus pushgateway | +| pushgateway_https_port | No | 443 | HTTPS port number | +| pushgateway_report_period_seconds | No | 1 | Time interval (in seconds) for pushing data to prometheus pushgateway | +| pushgateway_delete_on_shutdown | No | TRUE | Whether to delete the metric data when the task ends | +| pushgateway_jobname | No | | The task name of prometheus pushgateway | +| pushgateway_default_jobName_suffix | No | _metric | When the task name of prometheus pushgateway is empty, this suffix is appended to the bitsail job name to form the default task name. | + +#### Example + +```json +{ + "job": { + "common": { + "job_id": 313, + "instance_id": 3123, + "job_name": "bitsail_hadoop_to_print_test", + "user_name": "root", + "metrics_reporter_type": "prometheus_pushgateway", + "metric": { + "pushgateway_delete_on_shutdown": "false" + } + } + } +} +``` + +Start the prometheus pushgateway service through the container. + +```Bash +docker run -d -p 9091:9091 \ +-v "/etc/localtime:/etc/localtime" \ +prom/pushgateway +``` + +Before starting pushgateway, you need to modify the Prometheus configuration file prometheus.yml and restart the service to monitor PushGateway. 
+ +```yaml +global: + scrape_interval: 15s + evaluation_interval: 60s + external_labels: + monitor: codelab-monitor +scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - 'localhost:9090' + # Add the PushGateway scrape configuration + - job_name: 'pushgateway' + static_configs: + - targets: ['localhost:9091'] +``` + +### Verification and Testing + +![img](../../../../images/documents/components/metrics/prometheus/pushgateway_metric.png) + +## Grafana dashboard and Prometheus integration + +![img](../../../../images/documents/components/metrics/prometheus/bitsail_dashboard.png) + +### Custom Dashboard Processes + +#### Add Prometheus data source through Add Data Source + +![img](../../../../images/documents/components/metrics/prometheus/add_data_source.png) + +#### Customize the BitSail Metric dashboard through Add Panel + +![img](../../../../images/documents/components/metrics/prometheus/add_panel.png) + +### Grafana configuration template + +```json +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 3, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" 
+ }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "editorMode": "builder", + "expr": "avg(record_flow_control_latency{job=\"$pushgateway_job_name\", exported_job=\"$job\", exported_instance=\"$instance\", quantile=\"0.999\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "读取、写入流控延迟(ms)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], 
+ "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "editorMode": "builder", + "expr": "avg(record_invoke_latency{job=\"$pushgateway_job_name\", exported_job=\"$job\", exported_instance=\"$instance\", quantile=\"0.999\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "读取、写入延迟 (ms)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "editorMode": "builder", + "expr": "avg(max(record_flow_control_latency_count{job=\"$pushgateway_job_name\", exported_job=\"$job\", exported_instance=\"$instance\"}))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + 
"title": "Record Success Count (Records/s)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "editorMode": "code", + "expr": "max(record_failed_count{job=\"$pushgateway_job_name\", exported_job=\"$job\", exported_instance=\"$instance\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Record Failed Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, 
+ "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "editorMode": "builder", + "expr": "max(record_success_count{job=\"$pushgateway_job_name\", exported_job=\"$job\", exported_instance=\"$instance\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Record Success Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + 
"gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "editorMode": "code", + "expr": "max(record_success_bytes{job=\"$pushgateway_job_name\", exported_job=\"$job\", exported_instance=\"$instance\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Record Success Bytes", + "type": "timeseries" + } + ], + "refresh": "5s", + "revision": 1, + "schemaVersion": 38, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "pushgateway", + "value": "pushgateway" + }, + "hide": 0, + "name": "pushgateway_job_name", + "options": [ + { + "selected": true, + "text": "pushgateway", + "value": "pushgateway" + } + ], + "query": "pushgateway", + "skipUrlSync": false, + "type": "textbox" + }, + { + "current": { + "selected": false, + "text": "bitsail_hadoop_to_print_test_metric", + "value": "bitsail_hadoop_to_print_test_metric" + }, + "hide": 0, + "name": "job", + "options": [ + { + "selected": true, + "text": "bitsail_hadoop_to_print_test_metric", + "value": "bitsail_hadoop_to_print_test_metric" + } + ], + "query": "bitsail_hadoop_to_print_test_metric", + "skipUrlSync": false, + "type": "textbox" + }, + { + "current": { + "selected": false, + "text": "3123", + "value": "3123" + }, + "hide": 0, + "name": "instance", + "options": [ + { + "selected": true, + "text": "3123", + "value": "3123" + } + ], + "query": "3123", + "skipUrlSync": false, + "type": "textbox" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "BitSail", + "uid": "Nf9bXCb4k", + "version": 1, + "weekStart": "" +} +``` \ No newline at end of file diff --git 
a/website/images/documents/components/metrics/prometheus/add_data_source.png b/website/images/documents/components/metrics/prometheus/add_data_source.png new file mode 100644 index 000000000..6c29537d7 Binary files /dev/null and b/website/images/documents/components/metrics/prometheus/add_data_source.png differ diff --git a/website/images/documents/components/metrics/prometheus/add_panel.png b/website/images/documents/components/metrics/prometheus/add_panel.png new file mode 100644 index 000000000..c33dc1976 Binary files /dev/null and b/website/images/documents/components/metrics/prometheus/add_panel.png differ diff --git a/website/images/documents/components/metrics/prometheus/bitsail_dashboard.png b/website/images/documents/components/metrics/prometheus/bitsail_dashboard.png new file mode 100644 index 000000000..10b8f48ce Binary files /dev/null and b/website/images/documents/components/metrics/prometheus/bitsail_dashboard.png differ diff --git a/website/images/documents/components/metrics/prometheus/call_tracer.png b/website/images/documents/components/metrics/prometheus/call_tracer.png new file mode 100644 index 000000000..6744e29d7 Binary files /dev/null and b/website/images/documents/components/metrics/prometheus/call_tracer.png differ diff --git a/website/images/documents/components/metrics/prometheus/metric_manager_en.png b/website/images/documents/components/metrics/prometheus/metric_manager_en.png new file mode 100644 index 000000000..6d318536d Binary files /dev/null and b/website/images/documents/components/metrics/prometheus/metric_manager_en.png differ diff --git a/website/images/documents/components/metrics/prometheus/metric_manager_zh.png b/website/images/documents/components/metrics/prometheus/metric_manager_zh.png new file mode 100644 index 000000000..1a81cfbf3 Binary files /dev/null and b/website/images/documents/components/metrics/prometheus/metric_manager_zh.png differ diff --git 
a/website/images/documents/components/metrics/prometheus/metric_report_diagram.png b/website/images/documents/components/metrics/prometheus/metric_report_diagram.png new file mode 100644 index 000000000..955ad583c Binary files /dev/null and b/website/images/documents/components/metrics/prometheus/metric_report_diagram.png differ diff --git a/website/images/documents/components/metrics/prometheus/pushgateway_metric.png b/website/images/documents/components/metrics/prometheus/pushgateway_metric.png new file mode 100644 index 000000000..ca85d1625 Binary files /dev/null and b/website/images/documents/components/metrics/prometheus/pushgateway_metric.png differ diff --git a/website/zh/documents/components/metrics/prometheus-metric.md b/website/zh/documents/components/metrics/prometheus-metric.md new file mode 100644 index 000000000..9c81eee69 --- /dev/null +++ b/website/zh/documents/components/metrics/prometheus-metric.md @@ -0,0 +1,798 @@ +# bitsail-metrics-prometheus + +----- + +上级文档: [bitsail-components](../README.md) + +## BitSail Metrics介绍 + +Metric监控能实时汇报目前数据集成任务运行的状态,在作业故障时候方便使用者定位到问题所在。BitSail提供一个可供用户使用的Metric收集模块,实现了对Prometheus监控系统的支持,用户可以通过自定义Grafana仪表盘实现对BitSail运行时各类指标的监控。 + +![img](../../../../images/documents/components/metrics/prometheus/metric_manager_zh.png) + +- MetricManager主要负责MetricGroup的管理,通过`MetricReporterFactory`来获取MetricReporter。 +- Scheduled用来定义Reporter汇报监metric的时间间隔。 +- MetricReporter用来将metric汇报给外部的backend处理 + +![img](../../../../images/documents/components/metrics/prometheus/metric_report_diagram.png) + +对不同的服务监控系统进行metrics的汇报,只需要根据CommonOptions中的METRICS_REPORTER_TYPE获取相应的MetricReporterBuilder,获取对应的MetricReporter。 + +### 监控指标 + +BitSail的Metrics是通过引入`com.codahale.metrics`包实现的,将收集的metrics分为:Counter,Gauge,Histogram、Meter、Timer: + +- Meter 主要用于统计系统中某一个事件的速率,可以反应系统当前的处理能力,帮助我们判断资源是否已经不足。可以很方便帮助我们统计,每秒请求数(TPS)、每秒查询数(QPS)、最近 1 分钟平均每秒请求数、最近 5 分钟平均每秒请求数、最近 15 分钟平均每秒请求数等。 +- Guage 是最简单的度量指标,只有一个简单的返回值,通常用来记录一些对象或者事物的瞬时值。通过 Gauge 
可以完成自定义的度量类型,可以用于衡量一个待处理队列中任务的个数,以及目前内存使用量等等场景。 +- Counter 是累计型的度量指标,内部用 Gauge 封装了 AtomicLong。主要用它来统计队列中 Job 的总数;错误出现次数;服务请求数等等场景。 +- Histogram 是统计数据的分布情况的度量指标,提供了最小值,最大值,平均值,还有中位数,75 百分位,90 百分位,95 百分位,98 百分位,99 百分位,和 99.9 百分位的值。使用的场景,例如统计流量最大值、最小值、平均值、中位值等等。 +- Timer 本质是 Histogram 和 Meter 的结合,可以很方便的统计请求的速率和处理时间,例如磁盘读延迟统计,以及接口调用的延迟等信息的统计等等场景。 + +BitSail的应用示例: + +- 使用CallTracer用来统计*try*中代码的调用吞吐量和执行延迟 + - ![img](../../../../images/documents/components/metrics/prometheus/call_tracer.png) + + - 使用Timer类metric统计了函数的qps和执行耗时 + - 使用Counter类metric统计了吞吐量 +- 在DelegateFlinkWriter和DelegateSourcePipeline中统计写入和读取成功的record的字节数和失败的记录数。 + +## Prometheus和BitSail集成 + +关于Prometheus和Grafana安装使用,这里就不再过多赘述。Prometheus在正常情况下是采用pull模式从产生metric的作业或者exporter(比如专门监控主机的NodeExporter)拉取监控数据。BitSail通过PrometheusMetricReporter支持Prometheus的pull模式,通过PrometheusPushGatewayReporter支持Prometheus的push模式。 + +Prometheus 采用 pull 模式,可能由于不在一个子网或者防火墙原因,导致 Prometheus 无法直接拉取各个 target 数据。所以这里我们展示PushGateway模式的使用,PushGateway是一个中转组件,通过配置BitSail作业将metric推到PushGateway,Prometheus再从PushGateway拉取。Pushgateway 可以持久化推送给它的所有监控数据。因此,即使你的监控已经下线,prometheus 还可以拉取到旧的监控数据。 + +### common配置 + +| 参数名称 | 是否必填 | 参数枚举值 | 参数含义 | +| --------------------- | -------- | ---------------------------------------------------- | ------------------------------------- | +| metrics_reporter_type | 否 | "prometheus"、"prometheus_pushgateway"、"log"、"nop" | metrics reporter的类型,默认值为"log" | + +### metric配置 + +#### PrometheusMetricReporter + +| 参数名称 | 是否必填 | 参数默认值 | 参数含义 | +| --------------- | -------- | ---------- | ------------------------------------- | +| prometheus_host | 否 | 9999 | bitsail推送至prometheus server的端口号 | + +#### PrometheusPushGatewayReporter + +| 参数名称 | 是否必填 | 参数默认值 | 参数含义 | +| ---------------------------------- | -------- | ---------- | ------------------------------------------------------------ | +| pushgateway_host | 否 | localhost | prometheus pushgateway的host | +| pushgateway_port | 否 | 9091 | prometheus pushgateway的port | +| 
pushgateway_https_port | 否 | 443 | HTTPS端口号 | +| pushgateway_report_period_seconds | 否 | 1 | 向prometheus pushgateway推送数据的时间间隔 | +| pushgateway_delete_on_shutdown | 否 | TRUE | 任务结束时是否删除监控数据 | +| pushgateway_jobname | 否 | 无 | prometheus pushgateway的任务名 | +| pushgateway_default_jobName_suffix | 否 | _metric | prometheus pushgateway的任务名为空时,会使用bitsail作业任务名拼接该suffix作为默认任务名。 | + +#### Example + +```json +{ + "job": { + "common": { + "job_id": 313, + "instance_id": 3123, + "job_name": "bitsail_hadoop_to_print_test", + "user_name": "root", + "metrics_reporter_type": "prometheus_pushgateway", + "metric": { + "pushgateway_delete_on_shutdown": "false" + } + } + } +} +``` + +测试时我们通过容器开启prometheus pushgateway服务。 + +```Bash +docker run -d -p 9091:9091 \ +-v "/etc/localtime:/etc/localtime" \ +prom/pushgateway +``` + +在启动pushgateway前,需要修改Prometheus配置文件 prometheus.yml并重启服务,使其监控PushGateway。 + +```yaml +global: + scrape_interval: 15s + evaluation_interval: 60s + external_labels: + monitor: codelab-monitor +scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - 'localhost:9090' + # 添加PushGateway 监控配置 + - job_name: 'pushgateway' + static_configs: + - targets: ['localhost:9091'] +``` + +### 验证与测试 + +![img](../../../../images/documents/components/metrics/prometheus/pushgateway_metric.png) + +## Grafana仪表盘和Prometheus集成 + +![img](../../../../images/documents/components/metrics/prometheus/bitsail_dashboard.png) + +### 自定义仪表盘流程 + +#### 通过Add Data Source添加Prometheus数据源 + +![img](../../../../images/documents/components/metrics/prometheus/add_data_source.png) + +#### 通过Add Panel自定义BitSail Metric仪表盘 + +![img](../../../../images/documents/components/metrics/prometheus/add_panel.png) + +### Grafana 配置模版 + +```json +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": 
false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 3, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "editorMode": "builder", + "expr": "avg(record_flow_control_latency{job=\"$pushgateway_job_name\", exported_job=\"$job\", exported_instance=\"$instance\", quantile=\"0.999\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "读取、写入流控延迟(ms)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + 
"axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "editorMode": "builder", + "expr": "avg(record_invoke_latency{job=\"$pushgateway_job_name\", exported_job=\"$job\", exported_instance=\"$instance\", quantile=\"0.999\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "读取、写入延迟 (ms)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + 
"mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "editorMode": "builder", + "expr": "avg(max(record_flow_control_latency_count{job=\"$pushgateway_job_name\", exported_job=\"$job\", exported_instance=\"$instance\"}))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Record Success Count (Records/s)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + 
"targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "editorMode": "code", + "expr": "max(record_failed_count{job=\"$pushgateway_job_name\", exported_job=\"$job\", exported_instance=\"$instance\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Record Failed Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "editorMode": "builder", + "expr": "max(record_success_count{job=\"$pushgateway_job_name\", exported_job=\"$job\", exported_instance=\"$instance\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Record Success Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "fieldConfig": { + "defaults": { + 
"color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "yPH0bsh4z" + }, + "editorMode": "code", + "expr": "max(record_success_bytes{job=\"$pushgateway_job_name\", exported_job=\"$job\", exported_instance=\"$instance\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Record Success Bytes", + "type": "timeseries" + } + ], + "refresh": "5s", + "revision": 1, + "schemaVersion": 38, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "pushgateway", + "value": "pushgateway" + }, + "hide": 0, + "name": "pushgateway_job_name", + "options": [ + { + "selected": true, + "text": "pushgateway", + "value": "pushgateway" + } + ], + "query": "pushgateway", + "skipUrlSync": false, + "type": "textbox" + }, + { + "current": { + "selected": false, + "text": "bitsail_hadoop_to_print_test_metric", + "value": "bitsail_hadoop_to_print_test_metric" + }, + 
"hide": 0, + "name": "job", + "options": [ + { + "selected": true, + "text": "bitsail_hadoop_to_print_test_metric", + "value": "bitsail_hadoop_to_print_test_metric" + } + ], + "query": "bitsail_hadoop_to_print_test_metric", + "skipUrlSync": false, + "type": "textbox" + }, + { + "current": { + "selected": false, + "text": "3123", + "value": "3123" + }, + "hide": 0, + "name": "instance", + "options": [ + { + "selected": true, + "text": "3123", + "value": "3123" + } + ], + "query": "3123", + "skipUrlSync": false, + "type": "textbox" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "BitSail", + "uid": "Nf9bXCb4k", + "version": 1, + "weekStart": "" +} +``` \ No newline at end of file