Skip to content

Commit eec5c31

Browse files
committed
add loki-stack
1 parent 703eea5 commit eec5c31

File tree

8 files changed

+248
-89
lines changed

8 files changed

+248
-89
lines changed

loki-stack/alertmanager.yml

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
---
# Alertmanager configuration for the loki-stack deployment.
#
# NOTE(review): the SMTP password, the WeChat Work API secret and the corp ID
# are stored in plain text in this file — consider supplying them from a
# secret store instead of committing them.

# Global settings: defaults for the notification channels (email, WeChat, ...).
global:
  # Time after which an alert that has not been updated is declared resolved
  # (used for alerts that do not carry their own end time).
  resolve_timeout: 5m
  # SMTP relay for email notifications. The original note refers to QQ Mail
  # (official host smtp.qq.com, port 465 or 587, POP3/SMTP service must be
  # enabled on the account); the host configured here is smtp.exmail.qq.com.
  smtp_smarthost: 'smtp.exmail.qq.com:465'
  smtp_from: '[email protected]'
  smtp_auth_username: '[email protected]'
  # Mailbox authorization code, not the account login password (using the
  # login password fails). The code is shown when the POP3/SMTP service is
  # enabled in the mailbox settings.
  smtp_auth_password: '********'
  # Whether STARTTLS is required; depends on the environment. If sending fails
  # with "email.loginAuth failed: 530 Must issue a STARTTLS command first",
  # set this to true. If TLS is enabled and sending fails with
  # "starttls failed: x509: certificate signed by unknown authority", set
  # insecure_skip_verify: true under email_configs to skip TLS verification.
  smtp_require_tls: false
  wechat_api_url: 'https://qyapi.weixin.qq.com/cgi-bin/'
  wechat_api_secret: '90TZ4irSQRR9BsOG9nQJElURhIRVeUk9whMiCCENAZY'
  wechat_api_corp_id: 'ww2de34f35a3b39b3d'

# Notification template files.
templates:
  - '/etc/alertmanager/templates/*.tmpl'

# Routing tree: decides which receiver each alert is dispatched to.
# Every alert enters at the root route, which therefore must not contain any
# matchers. The root route needs a receiver, used for alerts that match no
# child route (with no child routes, every alert goes to the root receiver).
# Alerts walk the child routes in order; on the first match the alert is sent
# to that child's receiver and matching stops, because `continue` defaults to
# false. With `continue: true` matching proceeds to the following siblings.
# An alert that matches no child is sent by the last route it did match
# (its parent or the root).
# To visualise the tree, paste this file into
# https://www.prometheus.io/webtools/alerting/routing-tree-editor/ and click
# "Draw Routing Tree".
route:
  # Labels used to group alerts: alerts sharing these label values are
  # aggregated into one notification. Use group_by: [...] to disable
  # aggregation entirely.
  group_by:
    - alertname
  # How long to wait after a new group is created before sending its first
  # notification, so that more alerts of the same group can be batched.
  group_wait: 10s
  # How long to wait before notifying about new alerts added to a group for
  # which a notification has already been sent.
  group_interval: 10s
  # How long to wait before re-sending a notification that was already
  # delivered while the problem persists. 1m is a testing value.
  repeat_interval: 1m
  # Default receiver; must match a name under `receivers`. No child routes are
  # defined, so every alert is delivered to this receiver.
  receiver: webhook

# Notification targets. Only "webhook" is referenced by the route above;
# "email" and "wechat" are defined but not routed to in this file.
receivers:
  - name: webhook
    webhook_configs:
      - url: 'http://172.24.206.236:8080/webhook'
        send_resolved: true

  - name: email
    email_configs:
      - to: '{{ template "email.to"}}'
        html: '{{ template "email.to.html" .}}'
        send_resolved: true

  - name: wechat
    wechat_configs:
      - corp_id: 'ww2de34f35a3b39b3d'
        api_secret: '90TZ4irSQRR9BsOG9nQJElURhIRVeUk9whMiCCENAZY'
        agent_id: '1000002'
        to_party: '3'
        message: '{{ template "wechat.default.message" . }}'
        send_resolved: true

# Suppress "warning" alerts while a "critical" alert with the same alertname
# and instance labels is firing.
inhibit_rules:
  - source_match:
      severity: critical
    target_match:
      severity: warning
    equal:
      - alertname
      - instance

loki-stack/docker-compose.yaml

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
---
# Docker Compose stack for log collection and alerting:
# Loki, Promtail, Alertmanager and Grafana, all attached to the "loki" network.
version: '3'

networks:
  loki: {}

services:
  loki:
    image: grafana/loki:2.6.0
    container_name: loki
    volumes:
      - ./loki-config.yaml:/etc/loki/loki-config.yaml
      - ./rules:/loki/rules
    # The original command contained a typographic em dash
    # ("—log.level=debug") instead of an ASCII hyphen. That token is not a
    # flag, so the debug log level was not being requested correctly.
    # NOTE(review): Go-style flag parsing stops at the first non-flag
    # argument, so the trailing "-target=ruler" was presumably never applied
    # either and Loki ran with its default target. A ruler-only instance does
    # not serve the push/query endpoints that promtail and grafana in this
    # stack rely on, so the target is spelled out as "all" (which includes the
    # ruler). Switch back to "-target=ruler" only if a separate Loki handles
    # ingestion and queries.
    command:
      - '-config.file=/etc/loki/loki-config.yaml'
      - '-log.level=debug'
      - '-target=all'
    ports:
      - '3100:3100'
    networks:
      - loki

  promtail:
    image: grafana/promtail:2.6.0
    container_name: promtail
    volumes:
      # Host logs exposed to promtail for scraping.
      - /var/log:/var/log
    # No config file is mounted, so this path must already exist in the image.
    command: '-config.file=/etc/promtail/config.yml'
    networks:
      - loki

  alertmanager:
    image: prom/alertmanager:v0.24.0
    container_name: alertmanager
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - ./templates:/etc/alertmanager/templates
    ports:
      # Quoted like the other port mappings so it is always read as a string.
      - '9093:9093'
    restart: always
    networks:
      - loki

  grafana:
    # NOTE(review): "latest" is unpinned, unlike the other images.
    image: grafana/grafana:latest
    container_name: grafana
    ports:
      # Grafana's container port 3000 is published on host port 3888.
      - '3888:3000'
    networks:
      - loki

loki-stack/loki-config.yaml

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
---
# Loki configuration: single instance, filesystem storage, in-memory rings,
# rule evaluation forwarded to Alertmanager.
auth_enabled: false

server:
  http_listen_port: 3100

# Settings shared by all Loki components.
common:
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

table_manager:
  # Switch that enables deletion of logs past the retention period
  # (defaults to false).
  retention_deletes_enabled: true
  # How long logs are retained (168h = 7 days).
  retention_period: 168h

ruler:
  # Rule files are read from the local directory below.
  storage:
    type: local
    local:
      directory: /loki/rules
  rule_path: /tmp/scratch

  enable_api: true
  # Alerts are sent to the "alertmanager" host on port 9093.
  alertmanager_url: http://alertmanager:9093
  enable_alertmanager_v2: true

  ring:
    kvstore:
      store: inmemory

  flush_period: 1m

loki-stack/rules/fake/alert.yaml

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
---
# Loki ruler alerting rules.
#
# Fixes relative to the original:
#   * severity was misspelled "warnning"; it is now "warning" so that it lines
#     up with the severity value targeted by the inhibit rule in
#     alertmanager.yml.
#   * The descriptions appended "%" to {{ $value }}, but count_over_time
#     yields a number of log lines, not a percentage.
groups:
  # More than 5 log lines labelled level="ERROR" within one minute.
  - name: '1分钟内日志中ERROR出现5次以上'
    rules:
      - alert: '1分钟内日志中ERROR出现5次以上'
        expr: 'count_over_time({level="ERROR"}[1m]) > 5'
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: '错误日志告警'
          description: '1分钟内日志中ERROR出现5次以上,(当前值: {{ $value }})'

  # More than 5 lines matching "Exception" from app="smart-diagnose" within
  # one minute.
  - name: '1分钟内程序产生5次以上异常'
    rules:
      - alert: '1分钟内程序产生5次以上异常'
        expr: 'count_over_time({app="smart-diagnose"}|~"Exception"[1m]) > 5'
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: '异常数告警'
          description: '1分钟内程序产生5次以上异常,(当前值: {{ $value }})'

  # Any line matching "Exception" from app="smart-diagnose" within one minute.
  - name: '程序异常告警'
    rules:
      - alert: '程序异常告警'
        expr: 'count_over_time({app="smart-diagnose"} |~ "Exception" [1m]) > 0'
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: '程序异常告警'
          description: '{{ $labels.app }} 正在抛出异常.'

loki-stack/templates/alert.tmpl

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
{{/*
  Email notification templates referenced by the "email" receiver in
  alertmanager.yml ("email.to" and "email.to.html").

  "email.to.html" renders one HTML fragment per alert: firing alerts under the
  notification heading and resolved alerts under the recovery heading.
  Each section iterates only its own subset (.Alerts.Firing / .Alerts.Resolved).
  The original iterated .Alerts in both sections, so a group containing both
  firing and resolved alerts listed every alert under both headings.
  Timestamps are formatted in the local time zone of the Alertmanager process.
*/}}
{{ define "email.from" }}[email protected]{{ end }}
{{ define "email.to" }}[email protected]{{ end }}
{{ define "email.to.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}{{ range .Alerts.Firing }}
========= 告警通知 =========
告警名称: {{ .Labels.alertname }} <br>
告警级别: {{ .Labels.severity }} <br>
告警主题: {{ .Annotations.summary }} <br>
告警详情: {{ .Annotations.description }} <br>
告警时间: {{ .StartsAt.Local.Format "2006-01-02 15:04:05" }} <br>
{{ end }}{{ end -}}
{{- if gt (len .Alerts.Resolved) 0 -}}{{ range .Alerts.Resolved }}
========= 告警恢复 =========
告警名称: {{ .Labels.alertname }} <br>
告警级别: {{ .Labels.severity }} <br>
告警主题: {{ .Annotations.summary }}<br>
告警详情: {{ .Annotations.description }}<br>
告警时间: {{ .StartsAt.Local.Format "2006-01-02 15:04:05" }}<br>
恢复时间: {{ .EndsAt.Local.Format "2006-01-02 15:04:05" }}<br>
{{ end }}{{ end -}}
{{- end }}

loki-stack/templates/wechat.tmpl

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
{{/*
  WeChat Work message template referenced by the "wechat" receiver in
  alertmanager.yml ("wechat.default.message").

  Only the first alert of each subset is rendered (the "eq $index 0" guard),
  which keeps the message to one entry per section.
  The loops iterate .Alerts.Firing and .Alerts.Resolved respectively. The
  original iterated .Alerts in both sections, so index 0 could be a resolved
  alert shown under the firing heading, or a firing alert shown under the
  recovery heading.
  Timestamps are shifted by 28800e9 ns (8 hours) before formatting.
*/}}
{{ define "wechat.default.message" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts.Firing -}}
{{- if eq $index 0 }}
========= 告警通知 =========
告警名称:{{ $alert.Labels.alertname }}
告警级别:{{ $alert.Labels.severity }}
告警状态:{{ .Status }}
告警主机: {{ $alert.Labels.instance }}
告警主题: {{ $alert.Annotations.summary }}
告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};
触发阀值:{{ .Annotations.value }}
告警时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
========= = end = =========
{{- end }}
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts.Resolved -}}
{{- if eq $index 0 }}
========= 告警恢复 =========
告警名称:{{ $alert.Labels.alertname }}
告警级别:{{ $alert.Labels.severity }}
告警状态:{{ .Status }}
告警主机: {{ $alert.Labels.instance }}
告警主题: {{ $alert.Annotations.summary }}
告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};
告警时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
恢复时间: {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{- if gt (len $alert.Labels.instance) 0 }}
实例信息: {{ $alert.Labels.instance }}
{{- end }}
========= = end = =========
{{- end }}
{{- end }}
{{- end }}
{{- end }}

loki/docker-compose.yaml

Lines changed: 0 additions & 28 deletions
This file was deleted.

loki/docker-compose.yaml.bak

Lines changed: 0 additions & 61 deletions
This file was deleted.

0 commit comments

Comments
 (0)