-
Notifications
You must be signed in to change notification settings - Fork 0
/
clients.cfg
248 lines (203 loc) · 13.4 KB
/
clients.cfg
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
define host{
use linux-server
host_name nagios-client
hostgroups Mongo Servers
alias server
address 192.168.1.104
max_check_attempts 5
check_period 24x7
notification_interval 30
notification_period 24x7
}
define hostgroup{
hostgroup_name Mongo Servers
alias Mongo Servers
members nagios-client
}
define service {
use generic-service
host_name nagios-client
service_description SSH
check_command check_ssh
notifications_enabled 0
}
## MongoDB
# sudo nano /usr/local/nagios/etc/servers/clients.cfg
# 底下有 6 個 Q. 待解決, 每個 define_service 要確認適用的角色 mongod, configsvr, mongos ? 監控比對的基本單位 ? 如何驗證 ?
# Check Connection
# This will check each host that is listed in the Mongo Servers group. It will issue a warning if the connection to the server takes 2 seconds and a critical error if it takes over 4 seconds
# 檢查連線到 server 的時間。超過2秒(warning), 超過4秒(critical)
define service {
use generic-service
hostgroup_name Mongo Servers
service_description Mongo Connect Check
check_command check_mongodb!connect!30001!2!4
}
# Check Percentage of Open Connections
# This is a test that will check the percentage of free connections left on the Mongo server. In the following example it will send out an warning if the connection pool is 70% used and a critical error if it is 80% used.
# 檢查多少可用的連線數被占用。 超過70%(warning), 超過80%(critical)
define service {
use generic-service
hostgroup_name Mongo Servers
service_description Mongo Free Connections
check_command check_mongodb!connections!40000!70!80
}
# Check Replication Lag
# This is a test that will test the replication lag of Mongo servers. It will send out a warning if the lag is over 15 seconds and a critical error if its over 30 seconds. Please note that this check uses 'optime' from rs.status() which will be behind realtime as heartbeat requests between servers only occur every few seconds. Thus this check may show an apparent lag of < 10 seconds when there really isn't any. Use larger values for reliable monitoring.
# 檢查 replication lag 的時間。超過15秒(warning), 超過30秒(critical)。透過 rs.status() 查看 'optime'。
define service {
use generic-service
hostgroup_name Mongo Servers
service_description Mongo Replication Lag
check_command check_mongodb!replication_lag!30001!15!30
}
# Check Replication Lag Percentage
# This is a test that will test the replication lag percentage of Mongo servers. It will send out a warning if the lag is over 50 percents and a critical error if its over 75 percents. Please note that this check gets oplog timeDiff from primary and compares it to replication lag. When this check reaches 100 percent full resync is needed.
# 檢查 oplog 落後的百分比。 超過50%(warning), 超過70%(critical)。超過 100% 需要重新同步。
define service {
use generic-service
hostgroup_name Mongo Servers
service_description Mongo Replication Lag Percentage
check_command check_mongodb!replication_lag_percent!20001!50!75
}
# Check Memory Usage
# This is a test that will test the memory usage of Mongo server. In my example my Mongo servers have 32 gigs of memory so I'll trigger a warning if Mongo uses over 20 gigs of ram and a error if Mongo uses over 28 gigs of memory.
# 檢查使用多少記憶體。 超過 20GB(warning), 超過 28GB(critical)。
define service {
use generic-service
hostgroup_name Mongo Servers
service_description Mongo Memory Usage
check_command check_mongodb!memory!40000!20!28
}
# Check Mapped Memory Usage
# This is a test that will check the mapped memory usage of Mongo server.
# 檢查使用多少映射記憶體。 超過 20GB(warning), 超過 28GB(critial)。
define service {
use generic-service
hostgroup_name Mongo Servers
service_description Mongo Mapped Memory Usage
check_command check_mongodb!memory_mapped!40000!20!28
}
# Check Average Flush Time
# This is a test that will check the average flush time of Mongo server. In my example my Mongo I want to be warned if the average flush time is above 100ms and get an error if it's above 200ms. When you start to get a high average flush time it means your database is write bound.
# 檢查平均刷新時間。 超過 100毫秒(warning), 超過 200毫秒(critial)。 若平均刷新時間增多,表示出現 write bound 。
# 有錯誤 CRITICAL - General MongoDB Error: 'backgroundFlushing'
# 官網說 backgroundFlushing information only appears for instances that use the MMAPv1 storage engine. 應該是 Mongod 是 WT 引起的 (建議關閉此服務)
# define service {
# use generic-service
# hostgroup_name Mongo Servers
# service_description Mongo Flush Average
# check_command check_mongodb!flushing!40000!100!200
# }
# Check Last Flush Time
# This is a test that will check the last flush time of Mongo server. In my example my Mongo I want to be warned if the last flush time is above 200ms and get an error if it's above 400ms. When you start to get a high flush time it means your server might be needing faster disk or its time to shard.
# 檢查最後刷新時間。 超過 200毫秒(warning), 超過 400毫秒(critial)。若最後刷新時間增多,表示硬碟太慢,或正在做分片。
# 有錯誤 CRITICAL - General MongoDB Error: 'backgroundFlushing'
# 官網說 backgroundFlushing information only appears for instances that use the MMAPv1 storage engine. 應該是 Mongod 是 WT 引起的 (建議關閉此服務)
# define service {
# use generic-service
# hostgroup_name Mongo Servers
# service_description Mongo Last Flush Time
# check_command check_mongodb!last_flush_time!40000!200!400
# }
# Check status of mongodb replicaset
# This is a test that will check the status of nodes within a replicaset. Depending which status it is it sends a waring during status 0, 3 and 5, critical if the status is 4, 6 or 8 and a ok with status 1, 2 and 7.
# Note the trailing 2 0's keep those 0's as the check doesn't compare to anything.. So those values need to be there for the check to work.
# 檢查此節點在複製集的健康狀態。 1,2,7(OK) 0,3,5(warning) 4,6,8(critial) 。 rs.status()的 health項目。
# check_command 的行末兩個0的參數表示 -C -W 沒有 compare 任何東西。(還是要寫出來)
define service {
use generic-service
hostgroup_name Mongo Servers
service_description MongoDB state
check_command check_mongodb!replset_state!20001!0!0
}
# Check status of index miss ratio
# This is a test that will check the ratio of index hits to misses. If the ratio is high, you should consider adding indexes. I want to get a warning if the ratio is above .005 and get an error if it's above .01
# 測試查詢沒有 index 可用的比值(跟 cache沒關係)。 超過 0.005(warning), 超過 0.01(critial)。 [index_miss_ratio 公式] = 沒有使用index的查詢次數 / 總查詢次數。 數值太高,表示有些常用的查詢條件沒有建立 index。
define service {
use generic-service
hostgroup_name Mongo Servers
service_description MongoDB Index Miss Ratio
check_command check_mongodb!index_miss_ratio!20001!.005!.01
}
# Check number of databases and number of collections
# These tests will count the number of databases and the number of collections. It is usefull e.g. when your application "leaks" databases or collections. Set the warning, critical level to fit your application.
# 測試建立的 db 數量。 超過 300個(warning), 超過 500個(critial)。
# 測試建立的 coll數量。 超過 300個(warning), 超過 500個(critial)。
define service {
use generic-service
hostgroup_name Mongo Servers
service_description MongoDB Number of databases
check_command check_mongodb!databases!40000!300!500
}
define service {
use generic-service
hostgroup_name Mongo Servers
service_description MongoDB Number of collections
check_command check_mongodb!collections!40000!300!500
}
# Check size of a database
# This will check the size of a database. This is useful for keeping track of growth of a particular database. Replace nagios with the name of your database
# 檢查 db資料大小。 超過 300(warning), 超過 500(critial)。 需指定 db。 Q.不知道單位是不是 MB ?
define service {
use generic-service
hostgroup_name Mongo Servers
service_description MongoDB Database size nagios
check_command check_mongodb_database!database_size!30001!300!500!nagios
}
# Check index size of a database
# This will check the index size of a database. Overlarge indexes eat up memory and indicate a need for compaction. Replace nagios with the name of your database
# 檢查 db索引大小。 超過 50(warning), 超過 100(critial)。 需指定 db。 Q.不知道單位是不是 MB ? 若值過高,表示 index 占用大量內存,需要執行 compact指令,重整 data 和 index。
define service {
use generic-service
hostgroup_name Mongo Servers
service_description MongoDB Database index size nagios
check_command check_mongodb_database!database_indexes!40000!50!100!nagios
}
# Check index size of a collection
# This will check the index size of a collection. Overlarge indexes eat up memory and indicate a need for compaction. Replace nagios with the name of your database and mongo with the name of your collection
# 檢查 coll索引大小。 超過 50(warning), 超過 100(critial)。 需指定 db, coll。 Q.不知道單位是不是 MB ? 若值過高,表示 index 占用大量內存,需要執行 compact指令,重整 data 和 index。
# Q. 描述錯誤?
define service {
use generic-service
hostgroup_name Mongo Servers
service_description MongoDB Collection index size nagios
check_command check_mongodb_collection!collection_indexes!40000!50!100!nagios!mongo
}
# Check the primary server of replicaset
# This will check the primary server of a replicaset. This is useful for catching unexpected stepdowns of the replica's primary server. Replace your-replicaset with the name of your replicaset
# 檢查此節點是否仍為 Primary。
define service {
use generic-service
hostgroup_name Mongo Servers
service_description MongoDB Replicaset Master Monitor
check_command check_mongodb_replicaset!replica_primary!20001!0!1!sh01
}
# Check the number of queries per second
# This will check the number of queries per second on a server. Since MongoDB gives us the number as a running counter, we store the last value in the local database in the nagios_check collection. The following types are accepted: query|insert|update|delete|getmore|command
# This command will check updates per second and alert if the count is over 200 and warn if over 150
# 檢查此節點每秒的請求次數。 請求可分成 6 種: query|insert|update|delete|getmore|command 。 Q.不知道有監控 mongod 和 mongos 有沒有差別?
define service {
use generic-service
hostgroup_name Mongo Servers
service_description MongoDB Updates per Second
check_command check_mongodb_query!queries_per_second!20001!200!150!update
}
# Check Primary Connection
# This will check each host that is listed in the Mongo Servers group. It will issue a warning if the connection to the primary server of current replicaset takes 2 seconds and a critical error if it takes over 4 seconds
# 檢查此 hostgroup_name 上的每個主機,與其中 Primary的連線時間。 超過 2秒(warning), 超過 4秒(critial)。 Q.不知道 hostgroup_name 改成 host_name 有沒有差別?
define service {
use generic-service
hostgroup_name Mongo Servers
service_description Mongo Connect Check
check_command check_mongodb!connect_primary!40000!2!4
}
# Check Collection State
# This will check each host that is listed in the Mongo Servers group. It can be useful to check availability of a critical collection (locks, timeout, config server unavailable...). It will issue a critical error if find_one query failed
# 檢查此 hostgroup_name 上的每個主機,用來確認是否能對重要的 collection 正常執行 findOne()。
define service {
use generic-service
hostgroup_name Mongo Servers
service_description Mongo Collection State
check_command check_mongodb_collection!collection_state!40000!0!0!nagios!mongo
}