fhservices.cfg.j2
# Cannot ping localhost, so force the host check to check_dummy 0 (always returns OK)
define command {
command_name no-localhost-check
command_line /usr/lib64/nagios/plugins/check_dummy 0 "localhost host check is not checkable"
}
# Check that a component responds to a /sys/info/ping over HTTP
define command {
command_name check_fh_component_http_ping
command_line $USER1$/check_http -H $ARG1$ -p $ARG2$ -u $ARG3$
}
# Check that a component responds to a /sys/info/health over HTTP
define command {
command_name check_fh_component_health
command_line /opt/rhmap/nagios/plugins/fh-check-component-health -H $ARG1$ -P $ARG2$ -E $ARG3$
}
# Check that an EAP component responds to a ping over HTTP
define command {
command_name check_eap_component_health
command_line $USER1$/check_http -H $ARG1$ -p $ARG2$ -u $ARG3$ -e "$ARG4$"
}
# Check for valid response from Memcached
define command {
command_name check_memcached
command_line $USER1$/check_tcp -H memcached -p $ARG1$ -E -s 'version\n' -e 'VERSION' -w2 -c5 -t5
}
# Check that Jenkins responds to /login over HTTP
define command {
command_name check_jenkins
command_line $USER1$/check_http -H jenkins -p $ARG1$ -u $ARG2$
}
# Check the status of each Mac server via Jenkins
define command {
command_name ios_machine_health
command_line /opt/rhmap/nagios/plugins/ios_machine_health -H $ARG1$ -P $ARG2$ -u $ARG3$ -p $ARG4$
}
# Check the status of the android-sdk
define command {
command_name android_sdk_health
command_line /opt/rhmap/nagios/plugins/android_sdk_health
}
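# Check the status of the Android persistent volume claim (PVC)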
define command {
command_name android_pvc_check
command_line /opt/rhmap/nagios/plugins/android_pvc_check
}
# Check for valid response from Redis
define command {
command_name check_redis
command_line $USER1$/check_tcp -H redis -p $ARG1$ -E -s 'echo test\n' -e '$$4\n' -e 'test' -q 'QUIT' -w2 -c5 -t5
}
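# Check that a host is reachable via a single ping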
define command {
command_name check_nagios_host_alive
command_line $USER1$/check_ping -H $HOSTADDRESS$ -w 3000,100% -c 5000,100% -p 1
}
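# Check pod disk usage against warning/critical thresholds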
define command {
command_name check_pod_disk_usage
command_line /opt/rhmap/nagios/plugins/disk-usage -w $ARG1$ -c $ARG2$
}
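# Check MongoDB pod volume usage against warning/critical thresholds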
define command {
command_name check_mongo_pod_disk_usage
command_line /opt/rhmap/nagios/plugins/disk-usage-mongo -w $ARG1$ -c $ARG2$
}
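# Check pod memory usage against warning/critical thresholds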
define command {
command_name check_pod_memory_usage
command_line /opt/rhmap/nagios/plugins/memory-usage -w $ARG1$ -c $ARG2$
}
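# Check pod CPU usage against warning/critical thresholds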
define command {
command_name check_pod_cpu_usage
command_line /opt/rhmap/nagios/plugins/cpu-usage -w $ARG1$ -c $ARG2$
}
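# Check that containers have CPU and memory resource limits set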
define command {
command_name check_pod_resource_limits
command_line /opt/rhmap/nagios/plugins/resource-limits
}
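# Check the health of the MongoDB replica set in the given container(s)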
define command {
command_name check_mongodb
command_line /opt/rhmap/nagios/plugins/mongodb-health --container $ARG1$
}
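# Check that a MongoDB service responds on its TCP port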
define command {
command_name check_mongodb_ping
command_line $USER1$/check_tcp -H $ARG1$ -p $ARG2$ -w2 -c5 -t5
}
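# Check that each MongoDB pod is scheduled on a separate node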
define command {
command_name check_mongodb_affinity
command_line /opt/rhmap/nagios/plugins/mongodb-affinity
}
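# Check the health of a MySQL service and its containers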
define command {
command_name check_mysql
command_line /opt/rhmap/nagios/plugins/mysql-health --service $ARG1$ --containers $ARG2$
}
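# Check the health of MySQL master/slave replication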
define command {
command_name check_mysql_master_slave
command_line /opt/rhmap/nagios/plugins/mysql-master-slave-health --service $ARG1$ --containers $ARG2$
}
define contact {
contact_name rhmapadmin
use generic-contact
alias RHMAP Admin
email {{ rhmap_admin_email }}
}
define contactgroup {
contactgroup_name rhmapadmins
alias RHMAP Administrators
members rhmapadmin
}
define hostgroup {
hostgroup_name core
alias core
}
define hostgroup {
hostgroup_name mbaas
alias mbaas
}
define hostgroup {
hostgroup_name digger
alias digger
}
# Host template
define host {
name fh-component-container
use generic-host
check_period 24x7
check_interval 5
retry_interval 1
max_check_attempts 10
check_command no-localhost-check
notification_period 24x7
notification_interval 120
notification_options d,u,r
contact_groups rhmapadmins
register 0
}
# This is really a dummy host, defined only so the host doesn't show as red (down) in Nagios
define host {
use fh-component-container
address {{ rhmap_router_dns }}
host_name {{ rhmap_router_dns }}
hostgroups {{ rhmap_hostgroups }}
}
# Service templates
define service {
name database-service
use generic-service
normal_check_interval 1
register 0
}
define service {
name fh-service
use generic-service
normal_check_interval 5
register 0
}
define service {
name fh-fifteen-minute-service
use generic-service
normal_check_interval 15
register 0
}
{% for fhservice in fh_services if 'ping' in fhservice['checks'] %}
define service {
service_description {{ fhservice['name'] }}::Ping
check_command check_fh_component_http_ping!{{ fhservice.get("service", fhservice['name']) }}!{{ fhservice.get("port", "8080") }}!{{ fhservice.get("ping_endpoint", "/sys/info/ping") }}
use fh-service
hostgroup_name {{ ",".join(fhservice['hostgroups']) }}
notes This server cannot access {{ fhservice['name'] }}; check that it is running and that this server can reach it on port {{ fhservice.get("port", "8080") }}
contact_groups rhmapadmins
}
{% endfor %}
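# Illustrative sketch only: the service name "fh-aaa" below is a hypothetical
# example, not a real inventory entry. Given an fh_services item such as
#   { name: "fh-aaa", checks: ["ping"], hostgroups: ["core"] }
# the loop above would render (using the defaults) roughly:
#   define service {
#   service_description fh-aaa::Ping
#   check_command check_fh_component_http_ping!fh-aaa!8080!/sys/info/ping
#   use fh-service
#   hostgroup_name core
#   ...
#   }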
{% for fhservice in fh_services if 'health' in fhservice['checks'] %}
define service {
service_description {{ fhservice['name'] }}::Health
check_command check_fh_component_health!{{ fhservice.get("service", fhservice['name']) }}!{{ fhservice.get("port", "8080") }}!{{ fhservice.get("health_endpoint", "/sys/info/health") }}
use fh-service
hostgroup_name {{ ",".join(fhservice['hostgroups']) }}
notes This server failed to get a successful response from the {{ fhservice['name'] }} health endpoint. Check the response code & body and ensure the service and its dependencies are running and configured correctly.
contact_groups rhmapadmins
}
{% endfor %}
define service {
service_description keycloak:Health
check_command check_eap_component_health!ups!8080!/auth/admin!HTTP/1.1 302
use fh-service
hostgroup_name core
notes This server failed to get a successful response from the Keycloak service health endpoint. Check the response code & body and ensure the service and its dependencies are running and configured correctly.
contact_groups rhmapadmins
}
define service {
service_description memcached:Ping
check_command check_memcached!11211
use fh-service
hostgroup_name core
notes This server should be able to communicate with memcached on port 11211 but cannot. Check that it is running and that this server can communicate with it on that port.
contact_groups rhmapadmins
}
define service {
service_description redis:Ping
check_command check_redis!6379
use fh-service
hostgroup_name core
notes This server should be able to communicate with Redis on port 6379 but cannot. Check that it is running and that this server can communicate with it on that port.
contact_groups rhmapadmins
}
define service {
service_description Storage::Pod::Disk Usage
check_command check_pod_disk_usage!80!90
use fh-service
hostgroup_name core,mbaas,digger
notes One or more pods have volumes that are running out of disk space. Clean up the files on this pod if possible, or scale the pod down and back up so that it gets a fresh filesystem.
contact_groups rhmapadmins
}
{% if mongodb_instances > 1 %}
define service {
service_description mongodb::replicaset
check_command check_mongodb!mongodb,mongodb-service
use database-service
hostgroup_name core,mbaas
notes The MongoDB replica set may not be functioning properly. There should be exactly one primary member and zero or more secondary members.
contact_groups rhmapadmins
}
define service {
service_description mongodb::affinity
check_command check_mongodb_affinity
use database-service
hostgroup_name core,mbaas
notes This check ensures that each MongoDB pod is scheduled on a separate node.
contact_groups rhmapadmins
}
{% endif %}
define service {
service_description jenkins::ping
check_command check_jenkins!80!/login
use fh-service
hostgroup_name digger
contact_groups rhmapadmins
}
{% if check_mac %}
define service {
service_description mac_ios::ping
check_command ios_machine_health!{{ jenkins_host }}!{{ jenkins_port }}!{{ jenkins_user }}!{{ jenkins_pass }}
use fh-service
hostgroup_name digger
contact_groups rhmapadmins
}
{% endif %}
{% if check_android %}
define service {
service_description android_pvc::check
check_command android_pvc_check
use fh-service
hostgroup_name digger
contact_groups rhmapadmins
}
{% endif %}
{% for mongodb_service in mongodb_services %}
define service {
service_description {{ mongodb_service }}:Ping
check_command check_mongodb_ping!{{ mongodb_service }}!27017
use database-service
hostgroup_name core,mbaas
notes This server should be able to communicate with MongoDB on port 27017 but cannot. Check that it is running and that this server can communicate with it on that port.
contact_groups rhmapadmins
}
define service {
service_description Storage::Pod::Mongo Volume Usage
check_command check_mongo_pod_disk_usage!65!75
use fh-fifteen-minute-service
hostgroup_name core,mbaas
notes One or more MongoDB pods have volumes that are running out of disk space. Clean up the files on this pod if possible.
contact_groups rhmapadmins
}
{% endfor %}
{% if mysql_services|length == 1 %}
define service {
service_description mysql::health
check_command check_mysql!mysql!mysql
hostgroup_name core
use generic-service
notes The MySQL instance may not be functioning properly.
contact_groups rhmapadmins
}
{% endif %}
{% if mysql_services|length > 1 %}
{% for mysql_service in mysql_services %}
define service {
service_description {{ mysql_service }}::health
check_command check_mysql_master_slave!{{ mysql_service }}!{{ mysql_service }}
hostgroup_name core
use database-service
notes The MySQL instance or its master/slave replication may not be functioning properly.
contact_groups rhmapadmins
}
{% endfor %}
{% endif %}
define service {
service_description Container::Memory Usage
check_command check_pod_memory_usage!80!90
use fh-service
hostgroup_name core,mbaas,digger
notes One or more containers are running out of memory.
contact_groups rhmapadmins
}
define service {
service_description Container::CPU Usage
check_command check_pod_cpu_usage!80!90
use fh-service
hostgroup_name core,mbaas,digger
notes One or more containers have a high CPU load.
contact_groups rhmapadmins
}
define service {
service_description Container::Resource Limits
check_command check_pod_resource_limits
use fh-service
hostgroup_name core,mbaas,digger
notes One or more containers have no limit set for CPU and/or memory.
contact_groups rhmapadmins
}