diff --git a/automation/deploy_pgcluster.yml b/automation/deploy_pgcluster.yml
index 1ca8282a61..d7f46b9138 100644
--- a/automation/deploy_pgcluster.yml
+++ b/automation/deploy_pgcluster.yml
@@ -400,6 +400,9 @@

     - role: netdata

+    - role: prometheus
+      when: with_metric_exporters | bool
+
     # finish (info)
     - role: deploy_finish

diff --git a/automation/roles/common/defaults/Debian.yml b/automation/roles/common/defaults/Debian.yml
index c338a2aa18..81f5887c3d 100644
--- a/automation/roles/common/defaults/Debian.yml
+++ b/automation/roles/common/defaults/Debian.yml
@@ -150,6 +150,14 @@ haproxy_compile_requirements:
   - ca-certificates
   - libssl-dev

+# node exporter
+node_exporter_version: "1.9.1"
+node_exporter_package_repo: "https://github.com/prometheus/node_exporter/releases/download/v{{ node_exporter_version }}/node_exporter-{{ node_exporter_version }}.linux-{{ prometheus_architecture_map[ansible_architecture] }}.tar.gz"
+
+# postgres exporter
+postgres_exporter_version: "0.17.1"
+postgres_exporter_package_repo: "https://github.com/prometheus-community/postgres_exporter/releases/download/v{{ postgres_exporter_version }}/postgres_exporter-{{ postgres_exporter_version }}.linux-{{ prometheus_architecture_map[ansible_architecture] }}.tar.gz"
+
 # ================================================================================================= #
 # Offline installation (if installation_method: "file")
 #
diff --git a/automation/roles/common/defaults/RedHat.yml b/automation/roles/common/defaults/RedHat.yml
index fa6054f21c..5efd1c1965 100644
--- a/automation/roles/common/defaults/RedHat.yml
+++ b/automation/roles/common/defaults/RedHat.yml
@@ -175,6 +175,14 @@ haproxy_compile_requirements:
   - openssl-libs
   - systemd-devel

+# node exporter
+node_exporter_version: "1.9.1"
+node_exporter_package_repo: "https://github.com/prometheus/node_exporter/releases/download/v{{ node_exporter_version }}/node_exporter-{{ node_exporter_version }}.linux-{{ prometheus_architecture_map[ansible_architecture] }}.tar.gz"
+
+# postgres exporter
+postgres_exporter_version: "0.17.1"
+postgres_exporter_package_repo: "https://github.com/prometheus-community/postgres_exporter/releases/download/v{{ postgres_exporter_version }}/postgres_exporter-{{ postgres_exporter_version }}.linux-{{ prometheus_architecture_map[ansible_architecture] }}.tar.gz"
+
 # ================================================================================================= #
 # Offline installation (if installation_method: "file")
 #
diff --git a/automation/roles/common/defaults/main.yml b/automation/roles/common/defaults/main.yml
index 6be8024a58..a3217ba4cf 100644
--- a/automation/roles/common/defaults/main.yml
+++ b/automation/roles/common/defaults/main.yml
@@ -720,3 +720,25 @@ netdata_conf:
 # You can fine-tune retention for each tier by setting a time limit or size limit. Setting a limit to 0 disables it.
 # More options you can specify in the roles/netdata/templates/netdata.conf.j2
 # https://learn.netdata.cloud/docs/netdata-agent/configuration
+
+### Metric ###
+with_metric_exporters: false # or true
+
+# Node Exporter
+node_exporter_bin_path: /opt/prometheus/node-exporter
+node_exporter_host: ""
+node_exporter_port: 9100
+node_exporter_options: ""
+
+# Postgres Exporter
+postgres_exporter_bin_path: /opt/prometheus/postgres-exporter
+postgres_exporter_host: ""
+postgres_exporter_port: 9187
+postgres_exporter_db: postgres
+postgres_exporter_user: postgres
+postgres_exporter_data_source_name: "user={{ postgres_exporter_user }} dbname={{ postgres_exporter_db }} host={{ postgresql_unix_socket_dir }} sslmode=disable"
+postgres_exporter_query_filenames:
+  - queries-default.yml
+  # - queries-pg_stat_statements.yml
+  # - queries-pg_statio_user_tables.yml
+postgres_exporter_query_directory: files/
diff --git a/automation/roles/prometheus/defaults/main.yml b/automation/roles/prometheus/defaults/main.yml
new file mode 100644
index 0000000000..0611fa2e06
--- /dev/null
+++ b/automation/roles/prometheus/defaults/main.yml
@@ -0,0 +1,10 @@
+---
+prometheus_architecture_map:
+  amd64: amd64
+  x86_64: amd64
+  armv6l: armv6  # Prometheus release assets are named linux-armv6 / linux-armv7
+  armv7l: armv7
+  aarch64: arm64
+  arm64: arm64
+  32-bit: "386"
+  64-bit: amd64
diff --git a/automation/roles/prometheus/files/queries-default.yml b/automation/roles/prometheus/files/queries-default.yml
new file mode 100644
index 0000000000..6abf1fb601
--- /dev/null
+++ b/automation/roles/prometheus/files/queries-default.yml
@@ -0,0 +1,120 @@
+---
+# {{ ansible_managed }}
+
+######################################################################
+# Copied from
+# https://github.com/wrouesnel/postgres_exporter/blob/1afbd62ab194c045a88488d77de2f116400dedb7/queries.yaml
+
+pg_replication:
+  query: >-
+    SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))::INT
+    as lag
+  metrics:
+    - lag:
+        usage: "GAUGE"
+        description: "Replication lag behind master in seconds"
+
+pg_postmaster:
+  query: >-
+    SELECT pg_postmaster_start_time as start_time_seconds from
+    pg_postmaster_start_time()
+  metrics:
+    - start_time_seconds:
+        usage: "GAUGE"
+        description: "Time at which postmaster started"
+
+pg_stat_user_tables:
+  query: >-
+    SELECT schemaname, relname, seq_scan, seq_tup_read, idx_scan,
+    idx_tup_fetch, n_tup_ins, n_tup_upd, n_tup_del, n_tup_hot_upd, n_live_tup,
+    n_dead_tup, n_mod_since_analyze, last_vacuum, last_autovacuum,
+    last_analyze, last_autoanalyze, vacuum_count, autovacuum_count,
+    analyze_count, autoanalyze_count FROM pg_stat_user_tables
+  metrics:
+    - schemaname:
+        usage: "LABEL"
+        description: "Name of the schema that this table is in"
+    - relname:
+        usage: "LABEL"
+        description: "Name of this table"
+    - seq_scan:
+        usage: "COUNTER"
+        description: "Number of sequential scans initiated on this table"
+    - seq_tup_read:
+        usage: "COUNTER"
+        description: "Number of live rows fetched by sequential scans"
+    - idx_scan:
+        usage: "COUNTER"
+        description: "Number of index scans initiated on this table"
+    - idx_tup_fetch:
+        usage: "COUNTER"
+        description: "Number of live rows fetched by index scans"
+    - n_tup_ins:
+        usage: "COUNTER"
+        description: "Number of rows inserted"
+    - n_tup_upd:
+        usage: "COUNTER"
+        description: "Number of rows updated"
+    - n_tup_del:
+        usage: "COUNTER"
+        description: "Number of rows deleted"
+    - n_tup_hot_upd:
+        usage: "COUNTER"
+        description: >-
+          Number of rows HOT updated (i.e., with no separate index update
+          required)
+    - n_live_tup:
+        usage: "GAUGE"
+        description: "Estimated number of live rows"
+    - n_dead_tup:
+        usage: "GAUGE"
+        description: "Estimated number of dead rows"
+    - n_mod_since_analyze:
+        usage: "GAUGE"
+        description: "Estimated number of rows changed since last analyze"
+    - last_vacuum:
+        usage: "GAUGE"
+        description: >-
+          Last time at which this table was manually vacuumed (not counting
+          VACUUM FULL)
+    - last_autovacuum:
+        usage: "GAUGE"
+        description: >-
+          Last time at which this table was vacuumed by the autovacuum daemon
+    - last_analyze:
+        usage: "GAUGE"
+        description: "Last time at which this table was manually analyzed"
+    - last_autoanalyze:
+        usage: "GAUGE"
+        description: >-
+          Last time at which this table was analyzed by the autovacuum daemon
+    - vacuum_count:
+        usage: "COUNTER"
+        description: >-
+          Number of times this table has been manually vacuumed (not counting
+          VACUUM FULL)
+    - autovacuum_count:
+        usage: "COUNTER"
+        description: >-
+          Number of times this table has been vacuumed by the autovacuum daemon
+    - analyze_count:
+        usage: "COUNTER"
+        description: "Number of times this table has been manually analyzed"
+    - autoanalyze_count:
+        usage: "COUNTER"
+        description: >-
+          Number of times this table has been analyzed by the autovacuum daemon
+
+pg_database:
+  query: >-
+    SELECT pg_database.datname, pg_database_size(pg_database.datname) as size
+    FROM pg_database
+  metrics:
+    - datname:
+        usage: "LABEL"
+        description: "Name of the database"
+    - size:
+        usage: "GAUGE"
+        description: "Disk space used by the database"
+
+######################################################################
diff --git a/automation/roles/prometheus/files/queries-pg_stat_statements.yml b/automation/roles/prometheus/files/queries-pg_stat_statements.yml
new file mode 100644
index 0000000000..cccf16e5c3
--- /dev/null
+++ b/automation/roles/prometheus/files/queries-pg_stat_statements.yml
@@ -0,0 +1,110 @@
+---
+# {{ ansible_managed }}
+
+# This requires the pg_stat_statements module (disabled by default)
+# https://www.postgresql.org/docs/9.6/static/pgstatstatements.html
+pg_stat_statements:
+  metrics:
+    - rolname:
+        description: "Name of the user who executed the statement"
+        usage: LABEL
+    - datname:
+        description: "Name of the database"
+        usage: "LABEL"
+    - queryid:
+        description: Internal hash code, computed from the statement's parse tree
+        usage: LABEL
+    - query:
+        description: Text of a representative statement
+        usage: LABEL
+    - calls:
+        description: Number of times executed
+        usage: COUNTER
+    - total_time:
+        description: Total time spent in the statement, in milliseconds
+        usage: COUNTER
+    - min_time:
+        description: Minimum time spent in the statement, in milliseconds
+        usage: COUNTER
+    - max_time:
+        description: Maximum time spent in the statement, in milliseconds
+        usage: COUNTER
+    - mean_time:
+        description: Mean time spent in the statement, in milliseconds
+        usage: COUNTER
+    - stddev_time:
+        description: >-
+          Population standard deviation of time spent in the statement, in
+          milliseconds
+        usage: COUNTER
+    - rows:
+        description: Total number of rows retrieved or affected by the statement
+        usage: COUNTER
+    - shared_blks_hit:
+        description: Total number of shared block cache hits by the statement
+        usage: COUNTER
+    - shared_blks_read:
+        description: Total number of shared blocks read by the statement
+        usage: COUNTER
+    - shared_blks_dirtied:
+        description: Total number of shared blocks dirtied by the statement
+        usage: COUNTER
+    - shared_blks_written:
+        description: Total number of shared blocks written by the statement
+        usage: COUNTER
+    - local_blks_hit:
+        description: Total number of local block cache hits by the statement
+        usage: COUNTER
+    - local_blks_read:
+        description: Total number of local blocks read by the statement
+        usage: COUNTER
+    - local_blks_dirtied:
+        description: Total number of local blocks dirtied by the statement
+        usage: COUNTER
+    - local_blks_written:
+        description: Total number of local blocks written by the statement
+        usage: COUNTER
+    - temp_blks_read:
+        description: Total number of temp blocks read by the statement
+        usage: COUNTER
+    - temp_blks_written:
+        description: Total number of temp blocks written by the statement
+        usage: COUNTER
+    - blk_read_time:
+        description: >-
+          Total time the statement spent reading blocks, in milliseconds (if
+          track_io_timing is enabled, otherwise zero)
+        usage: GAUGE
+    - blk_write_time:
+        description: >-
+          Total time the statement spent writing blocks, in milliseconds (if
+          track_io_timing is enabled, otherwise zero)
+        usage: GAUGE
+  query: >-
+    SELECT
+      r.rolname,
+      d.datname,
+      queryid,
+      query,
+      calls,
+      total_time,
+      min_time,
+      max_time,
+      mean_time,
+      stddev_time,
+      rows,
+      shared_blks_hit,
+      shared_blks_read,
+      shared_blks_dirtied,
+      shared_blks_written,
+      local_blks_hit,
+      local_blks_read,
+      local_blks_dirtied,
+      local_blks_written,
+      temp_blks_read,
+      temp_blks_written,
+      blk_read_time,
+      blk_write_time
+    FROM pg_stat_statements s
+    LEFT JOIN pg_roles r on (s.userid = r.oid)
+    LEFT JOIN pg_database d on (s.dbid = d.oid)
diff --git a/automation/roles/prometheus/files/queries-pg_statio_user_tables.yml b/automation/roles/prometheus/files/queries-pg_statio_user_tables.yml
new file mode 100644
index 0000000000..f15f8b4994
--- /dev/null
+++ b/automation/roles/prometheus/files/queries-pg_statio_user_tables.yml
@@ -0,0 +1,43 @@
+---
+# {{ ansible_managed }}
+
+# These stats may not be available
+pg_statio_user_tables:
+  metrics:
+    - schemaname:
+        description: Name of the schema that this table is in
+        usage: LABEL
+    - relname:
+        description: Name of this table
+        usage: LABEL
+    - heap_blks_read:
+        description: Number of disk blocks read from this table
+        usage: COUNTER
+    - heap_blks_hit:
+        description: Number of buffer hits in this table
+        usage: COUNTER
+    - idx_blks_read:
+        description: Number of disk blocks read from all indexes on this table
+        usage: COUNTER
+    - idx_blks_hit:
+        description: Number of buffer hits in all indexes on this table
+        usage: COUNTER
+    - toast_blks_read:
+        description: >-
+          Number of disk blocks read from this table's TOAST table (if any)
+        usage: COUNTER
+    - toast_blks_hit:
+        description: Number of buffer hits in this table's TOAST table (if any)
+        usage: COUNTER
+    - tidx_blks_read:
+        description: >-
+          Number of disk blocks read from this table's TOAST table index (if any)
+        usage: COUNTER
+    - tidx_blks_hit:
+        description: >-
+          Number of buffer hits in this table's TOAST table index (if any)
+        usage: COUNTER
+  query: >-
+    SELECT schemaname, relname, heap_blks_read, heap_blks_hit, idx_blks_read,
+    idx_blks_hit, toast_blks_read, toast_blks_hit, tidx_blks_read,
+    tidx_blks_hit FROM pg_statio_user_tables
diff --git a/automation/roles/prometheus/handlers/main.yml b/automation/roles/prometheus/handlers/main.yml
new file mode 100644
index 0000000000..298d8015fb
--- /dev/null
+++ b/automation/roles/prometheus/handlers/main.yml
@@ -0,0 +1,34 @@
+---
+- name: Restart postgres_exporter
+  ansible.builtin.systemd:
+    daemon_reload: true
+    name: prometheus-postgres-exporter
+    state: restarted
+  listen: "restart postgres_exporter"
+
+- name: Verify postgres_exporter is responding to requests
+  ansible.builtin.uri:
+    url: "http://{% if postgres_exporter_host != '' %}{{ postgres_exporter_host }}{% else %}localhost{% endif %}:{{ postgres_exporter_port }}/"
+    return_content: true
+  retries: 5
+  delay: 3
+  register: metrics_output
+  failed_when: "'Metrics' not in metrics_output.content"
+  listen: "restart postgres_exporter"
+
+- name: Restart node_exporter
+  ansible.builtin.systemd:
+    daemon_reload: true  # reload a changed unit file before restarting
+    name: prometheus-node-exporter
+    state: restarted
+  listen: "restart node_exporter"
+
+- name: Verify node_exporter is responding to requests
+  ansible.builtin.uri:
+    url: "http://{% if node_exporter_host != '' %}{{ node_exporter_host }}{% else %}localhost{% endif %}:{{ node_exporter_port }}/"
+    return_content: true
+  retries: 5
+  delay: 3
+  register: metrics_output
+  failed_when: "'Metrics' not in metrics_output.content"
+  listen: "restart node_exporter"
diff --git a/automation/roles/prometheus/tasks/main.yml b/automation/roles/prometheus/tasks/main.yml
new file mode 100644
index 0000000000..0cb0196da9
--- /dev/null
+++ b/automation/roles/prometheus/tasks/main.yml
@@ -0,0 +1,6 @@
+---
+- name: Install Prometheus Node Exporter
+  ansible.builtin.import_tasks: node_exporter.yml
+
+- name: Install Prometheus Postgres Exporter
+  ansible.builtin.import_tasks: postgres_exporter.yml
diff --git a/automation/roles/prometheus/tasks/node_exporter.yml b/automation/roles/prometheus/tasks/node_exporter.yml
new file mode 100644
index 0000000000..d6da83d18f
--- /dev/null
+++ b/automation/roles/prometheus/tasks/node_exporter.yml
@@ -0,0 +1,47 @@
+---
+- name: Create node_exporter installation directory
+  ansible.builtin.file:
+    path: "{{ node_exporter_bin_path }}"
+    state: directory
+    mode: "0755"
+  tags: prometheus, node_exporter
+
+- name: Download and extract node_exporter archive
+  ansible.builtin.unarchive:
+    src: "{{ node_exporter_package_repo }}"
+    dest: /tmp
+    remote_src: true
+    mode: "0755"
+  tags: prometheus, node_exporter
+
+- name: Install node_exporter binary
+  ansible.builtin.copy:
+    src: "/tmp/node_exporter-{{ node_exporter_version }}.linux-{{ prometheus_architecture_map[ansible_architecture] }}/node_exporter"
+    dest: "{{ node_exporter_bin_path }}"
+    mode: "0755"
+    remote_src: true
+  notify: "restart node_exporter"
+  tags: prometheus, node_exporter
+
+- name: Create node_exporter user
+  ansible.builtin.user:
+    name: node_exporter
+    shell: /sbin/nologin
+    state: present
+  tags: prometheus, node_exporter
+
+- name: Setup systemd service for prometheus-node-exporter
+  ansible.builtin.template:
+    src: prometheus-node-exporter.service.j2
+    dest: /etc/systemd/system/prometheus-node-exporter.service
+    mode: "0644"
+  notify: "restart node_exporter"
+  tags: prometheus, node_exporter
+
+- name: Enable and start prometheus-node-exporter service
+  ansible.builtin.systemd:
+    daemon_reload: true  # make systemd read the unit file templated above
+    name: prometheus-node-exporter
+    state: started
+    enabled: true
+  tags: prometheus, node_exporter
diff --git a/automation/roles/prometheus/tasks/postgres_exporter.yml b/automation/roles/prometheus/tasks/postgres_exporter.yml
new file mode 100644
index 0000000000..594710b4c4
--- /dev/null
+++ b/automation/roles/prometheus/tasks/postgres_exporter.yml
@@ -0,0 +1,63 @@
+---
+- name: Create postgres_exporter system user
+  ansible.builtin.user:
+    name: "{{ postgres_exporter_user }}"
+    comment: "postgres_exporter_user"
+    system: true
+    shell: /bin/false
+    create_home: false
+  when: "postgres_exporter_user != 'postgres'"
+  tags: prometheus, postgres_exporter
+
+- name: Create directories for postgres_exporter
+  ansible.builtin.file:
+    path: "{{ item }}"
+    state: directory
+    mode: "0755"
+  loop:
+    - "{{ postgres_exporter_bin_path }}"
+    - /etc/prometheus
+  tags: prometheus, postgres_exporter
+
+- name: Download and extract postgres_exporter archive
+  ansible.builtin.unarchive:
+    src: "{{ postgres_exporter_package_repo }}"
+    dest: "/tmp"
+    remote_src: true
+    mode: "0755"
+  tags: prometheus, postgres_exporter
+
+- name: Install postgres_exporter binary
+  ansible.builtin.copy:
+    src: "/tmp/postgres_exporter-{{ postgres_exporter_version }}.linux-{{ prometheus_architecture_map[ansible_architecture] }}/postgres_exporter"
+    dest: "{{ postgres_exporter_bin_path }}"
+    mode: "0755"
+    remote_src: true
+  notify: "restart postgres_exporter"
+  tags: prometheus, postgres_exporter
+
+- name: Assemble postgres_exporter queries configuration
+  ansible.builtin.assemble:
+    dest: /etc/prometheus/postgres-queries.yml
+    regexp: "^({{ postgres_exporter_query_filenames | map('regex_escape') | join('|') }})$"  # group + escape so ^ and $ apply to every filename
+    remote_src: false
+    src: "{{ postgres_exporter_query_directory }}"
+    mode: "0644"
+  notify: "restart postgres_exporter"
+  tags: prometheus, postgres_exporter
+
+- name: Setup systemd service for prometheus-postgres-exporter
+  ansible.builtin.template:
+    dest: /etc/systemd/system/prometheus-postgres-exporter.service
+    src: prometheus-postgres-exporter.service.j2
+    mode: "0644"
+  notify: "restart postgres_exporter"
+  tags: prometheus, postgres_exporter
+
+- name: Enable and start prometheus-postgres-exporter service
+  ansible.builtin.systemd:
+    daemon_reload: true
+    enabled: true
+    name: prometheus-postgres-exporter
+    state: started
+  tags: prometheus, postgres_exporter
diff --git a/automation/roles/prometheus/templates/prometheus-node-exporter.service.j2 b/automation/roles/prometheus/templates/prometheus-node-exporter.service.j2
new file mode 100644
index 0000000000..714a64f9ab
--- /dev/null
+++ b/automation/roles/prometheus/templates/prometheus-node-exporter.service.j2
@@ -0,0 +1,11 @@
+[Unit]
+Description=NodeExporter
+
+[Service]
+TimeoutStartSec=0
+User=node_exporter
+ExecStart={{ node_exporter_bin_path }}/node_exporter --web.listen-address={{ node_exporter_host }}:{{ node_exporter_port }} {{ node_exporter_options }}
+Restart=on-failure
+
+[Install]
+WantedBy=multi-user.target
diff --git a/automation/roles/prometheus/templates/prometheus-postgres-exporter.service.j2 b/automation/roles/prometheus/templates/prometheus-postgres-exporter.service.j2
new file mode 100644
index 0000000000..804614e192
--- /dev/null
+++ b/automation/roles/prometheus/templates/prometheus-postgres-exporter.service.j2
@@ -0,0 +1,13 @@
+[Unit]
+Description=Prometheus Postgres Exporter Server
+
+[Service]
+User={{ postgres_exporter_user }}
+Environment="DATA_SOURCE_NAME={{ postgres_exporter_data_source_name }}"
+ExecStart={{ postgres_exporter_bin_path }}/postgres_exporter \
+  --extend.query-path /etc/prometheus/postgres-queries.yml \
+  --web.listen-address {{ postgres_exporter_host }}:{{ postgres_exporter_port }} \
+  --auto-discover-databases
+
+[Install]
+WantedBy=multi-user.target