fhservices.cfg.j2
# Cannot ping localhost, so force the host check to check_dummy 0 (always returns OK)
define command {
command_name no-localhost-check
command_line /usr/lib64/nagios/plugins/check_dummy 0 "localhost host check is not checkable"
}
# Check that a component responds to a /sys/info/ping over HTTP
define command {
command_name check_fh_component_http_ping
command_line $USER1$/check_http -H $ARG1$ -p $ARG2$ -u $ARG3$
}
# Check that a component responds to a /sys/info/health over HTTP
define command {
command_name check_fh_component_health
command_line /opt/rhmap/nagios/plugins/fh-check-component-health -H $ARG1$ -P $ARG2$ -E $ARG3$
}
# Check that an EAP component responds to a ping over HTTP
define command {
command_name check_eap_component_health
command_line $USER1$/check_http -H $ARG1$ -p $ARG2$ -u $ARG3$ -e "$ARG4$"
}
# Check for valid response from Memcached
define command {
command_name check_memcached
command_line $USER1$/check_tcp -H memcached -p $ARG1$ -E -s 'version\n' -e 'VERSION' -w2 -c5 -t5
}
# Check that Jenkins responds to /login over HTTP
define command {
command_name check_jenkins
command_line $USER1$/check_http -H jenkins -p $ARG1$ -u $ARG2$
}
# Check the status of each Mac server via Jenkins
define command {
command_name ios_machine_health
command_line /opt/rhmap/nagios/plugins/ios_machine_health -H $ARG1$ -P $ARG2$ -u $ARG3$ -p $ARG4$
}
# Check the status of the android-sdk
define command {
command_name android_sdk_health
command_line /opt/rhmap/nagios/plugins/android_sdk_health
}
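# Check the status of the Android persistent volume claim (PVC)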
define command {
command_name android_pvc_check
command_line /opt/rhmap/nagios/plugins/android_pvc_check
}
# Check for valid response from Redis
define command {
command_name check_redis
command_line $USER1$/check_tcp -H redis -p $ARG1$ -E -s 'echo test\n' -e '$$4\n' -e 'test' -q 'QUIT' -w2 -c5 -t5
}
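# Check that a host is reachable via a single ping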
define command {
command_name check_nagios_host_alive
command_line $USER1$/check_ping -H $HOSTADDRESS$ -w 3000,100% -c 5000,100% -p 1
}
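# Check pod disk usage against warning/critical thresholds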
define command {
command_name check_pod_disk_usage
command_line /opt/rhmap/nagios/plugins/disk-usage -w $ARG1$ -c $ARG2$
}
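# Check MongoDB pod volume usage against warning/critical thresholds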
define command {
command_name check_mongo_pod_disk_usage
command_line /opt/rhmap/nagios/plugins/disk-usage-mongo -w $ARG1$ -c $ARG2$
}
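# Check pod memory usage against warning/critical thresholds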
define command {
command_name check_pod_memory_usage
command_line /opt/rhmap/nagios/plugins/memory-usage -w $ARG1$ -c $ARG2$
}
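# Check pod CPU usage against warning/critical thresholds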
define command {
command_name check_pod_cpu_usage
command_line /opt/rhmap/nagios/plugins/cpu-usage -w $ARG1$ -c $ARG2$
}
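# Check that containers have CPU and memory resource limits set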
define command {
command_name check_pod_resource_limits
command_line /opt/rhmap/nagios/plugins/resource-limits
}
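# Check the health of the MongoDB replica set in the given container(s)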
define command {
command_name check_mongodb
command_line /opt/rhmap/nagios/plugins/mongodb-health --container $ARG1$
}
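# Check that a MongoDB service responds on its TCP port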
define command {
command_name check_mongodb_ping
command_line $USER1$/check_tcp -H $ARG1$ -p $ARG2$ -w2 -c5 -t5
}
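# Check that each MongoDB pod is scheduled on a separate node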
define command {
command_name check_mongodb_affinity
command_line /opt/rhmap/nagios/plugins/mongodb-affinity
}
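# Check the health of a MySQL service and its containers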
define command {
command_name check_mysql
command_line /opt/rhmap/nagios/plugins/mysql-health --service $ARG1$ --containers $ARG2$
}
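# Check the health of MySQL master/slave replication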
define command {
command_name check_mysql_master_slave
command_line /opt/rhmap/nagios/plugins/mysql-master-slave-health --service $ARG1$ --containers $ARG2$
}
define contact {
contact_name rhmapadmin
use generic-contact
alias RHMAP Admin
email {{ rhmap_admin_email }}
}
define contactgroup {
contactgroup_name rhmapadmins
alias RHMAP Administrators
members rhmapadmin
}
define hostgroup {
hostgroup_name core
alias core
}
define hostgroup {
hostgroup_name mbaas
alias mbaas
}
define hostgroup {
hostgroup_name digger
alias digger
}
# Host template
define host {
name fh-component-container
use generic-host
check_period 24x7
check_interval 5
retry_interval 1
max_check_attempts 10
check_command no-localhost-check
notification_period 24x7
notification_interval 120
notification_options d,u,r
contact_groups rhmapadmins
register 0
}
# This is really a dummy host, defined only so the host doesn't show as red (down) in Nagios
define host {
use fh-component-container
address {{ rhmap_router_dns }}
host_name {{ rhmap_router_dns }}
hostgroups {{ rhmap_hostgroups }}
}
# Service templates
define service {
name database-service
use generic-service
normal_check_interval 1
register 0
}
define service {
name fh-service
use generic-service
normal_check_interval 5
register 0
}
define service {
name fh-fifteen-minute-service
use generic-service
normal_check_interval 15
register 0
}
{% for fhservice in fh_services if 'ping' in fhservice['checks'] %}
define service {
service_description {{ fhservice['name'] }}::Ping
check_command check_fh_component_http_ping!{{ fhservice.get("service", fhservice['name']) }}!{{ fhservice.get("port", "8080") }}!{{ fhservice.get("ping_endpoint", "/sys/info/ping") }}
use fh-service
hostgroup_name {{ ",".join(fhservice['hostgroups']) }}
notes This server cannot access {{ fhservice['name'] }}; check that it is running and that this server can reach it on port {{ fhservice.get("port", "8080") }}
contact_groups rhmapadmins
}
{% endfor %}
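# Illustrative sketch only: the service name "fh-aaa" below is a hypothetical
# example, not a real inventory entry. Given an fh_services item such as
#   { name: "fh-aaa", checks: ["ping"], hostgroups: ["core"] }
# the loop above would render (using the defaults) roughly:
#   define service {
#   service_description fh-aaa::Ping
#   check_command check_fh_component_http_ping!fh-aaa!8080!/sys/info/ping
#   use fh-service
#   hostgroup_name core
#   ...
#   }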
{% for fhservice in fh_services if 'health' in fhservice['checks'] %}
define service {
service_description {{ fhservice['name'] }}::Health
check_command check_fh_component_health!{{ fhservice.get("service", fhservice['name']) }}!{{ fhservice.get("port", "8080") }}!{{ fhservice.get("health_endpoint", "/sys/info/health") }}
use fh-service
hostgroup_name {{ ",".join(fhservice['hostgroups']) }}
notes This server failed to get a successful response from the {{ fhservice['name'] }} health endpoint. Check the response code & body and ensure the service and its dependencies are running and configured correctly.
contact_groups rhmapadmins
}
{% endfor %}
define service {
service_description keycloak:Health
check_command check_eap_component_health!ups!8080!/auth/admin!HTTP/1.1 302
use fh-service
hostgroup_name core
notes This server failed to get a successful response from the Keycloak service health endpoint. Check the response code & body and ensure the service and its dependencies are running and configured correctly.
contact_groups rhmapadmins
}
define service {
service_description memcached:Ping
check_command check_memcached!11211
use fh-service
hostgroup_name core
notes This server should be able to communicate with memcached on port 11211 but cannot. Check that it is running and that this server can communicate with it on that port.
contact_groups rhmapadmins
}
define service {
service_description redis:Ping
check_command check_redis!6379
use fh-service
hostgroup_name core
notes This server should be able to communicate with Redis on port 6379 but cannot. Check that it is running and that this server can communicate with it on that port.
contact_groups rhmapadmins
}
define service {
service_description Storage::Pod::Disk Usage
check_command check_pod_disk_usage!80!90
use fh-service
hostgroup_name core,mbaas,digger
notes One or more pods have volumes that are running out of disk space. Clean up the files on this pod if possible, or scale the pod down and back up so that it gets a fresh filesystem.
contact_groups rhmapadmins
}
{% if mongodb_instances > 1 %}
define service {
service_description mongodb::replicaset
check_command check_mongodb!mongodb,mongodb-service
use database-service
hostgroup_name core,mbaas
notes The MongoDB replica set may not be functioning properly. There should be exactly one primary member and zero or more secondary members.
contact_groups rhmapadmins
}
define service {
service_description mongodb::affinity
check_command check_mongodb_affinity
use database-service
hostgroup_name core,mbaas
notes This check ensures that each MongoDB pod is scheduled on a separate node.
contact_groups rhmapadmins
}
{% endif %}
define service {
service_description jenkins::ping
check_command check_jenkins!80!/login
use fh-service
hostgroup_name digger
contact_groups rhmapadmins
}
{% if check_mac %}
define service {
service_description mac_ios::ping
check_command ios_machine_health!{{ jenkins_host }}!{{ jenkins_port }}!{{ jenkins_user }}!{{ jenkins_pass }}
use fh-service
hostgroup_name digger
contact_groups rhmapadmins
}
{% endif %}
{% if check_android %}
define service {
service_description android_pvc::check
check_command android_pvc_check
use fh-service
hostgroup_name digger
contact_groups rhmapadmins
}
{% endif %}
{% for mongodb_service in mongodb_services %}
define service {
service_description {{ mongodb_service }}:Ping
check_command check_mongodb_ping!{{ mongodb_service }}!27017
use database-service
hostgroup_name core,mbaas
notes This server should be able to communicate with MongoDB on port 27017 but cannot. Check that it is running and that this server can communicate with it on that port.
contact_groups rhmapadmins
}
define service {
service_description Storage::Pod::Mongo Volume Usage
check_command check_mongo_pod_disk_usage!65!75
use fh-fifteen-minute-service
hostgroup_name core,mbaas
notes One or more MongoDB pods have volumes that are running out of disk space. Clean up the files on this pod if possible.
contact_groups rhmapadmins
}
{% endfor %}
{% if mysql_services|length == 1 %}
define service {
service_description mysql::health
check_command check_mysql!mysql!mysql
hostgroup_name core
use generic-service
notes The MySQL instance may not be functioning properly.
contact_groups rhmapadmins
}
{% endif %}
{% if mysql_services|length > 1 %}
{% for mysql_service in mysql_services %}
define service {
service_description {{ mysql_service }}::health
check_command check_mysql_master_slave!{{ mysql_service }}!{{ mysql_service }}
hostgroup_name core
use database-service
notes The MySQL instance or its master/slave replication may not be functioning properly.
contact_groups rhmapadmins
}
{% endfor %}
{% endif %}
define service {
service_description Container::Memory Usage
check_command check_pod_memory_usage!80!90
use fh-service
hostgroup_name core,mbaas,digger
notes One or more containers are running out of memory.
contact_groups rhmapadmins
}
define service {
service_description Container::CPU Usage
check_command check_pod_cpu_usage!80!90
use fh-service
hostgroup_name core,mbaas,digger
notes One or more containers have a high CPU load.
contact_groups rhmapadmins
}
define service {
service_description Container::Resource Limits
check_command check_pod_resource_limits
use fh-service
hostgroup_name core,mbaas,digger
notes One or more containers have no limit set for CPU and/or memory.
contact_groups rhmapadmins
}