Skip to content

Commit af68b0a

Browse files
authored
pgBackRest: Point-In-Time Recovery (PITR) Improvements (#765)
1 parent 7149438 commit af68b0a

File tree

3 files changed

+96
-34
lines changed

3 files changed

+96
-34
lines changed

automation/inventory

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -61,7 +61,3 @@ ansible_ssh_port='22'
6161
#ansible_ssh_pass='secretpassword' # "sshpass" package is required for use "ansible_ssh_pass"
6262
#ansible_ssh_private_key_file=
6363
#ansible_python_interpreter='/usr/bin/python3'
64-
65-
[pgbackrest:vars]
66-
#ansible_user='postgres'
67-
#ansible_ssh_pass='secretpassword'

automation/roles/patroni/tasks/main.yml

Lines changed: 93 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -507,7 +507,7 @@
507507
- name: Prepare PostgreSQL | start PostgreSQL on Master
508508
become: true
509509
become_user: postgres
510-
ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl start -D {{ postgresql_data_dir }} -w -t 1800"
510+
ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl start -D {{ postgresql_data_dir }} -w -t {{ pg_ctl_timeout | default(3600) }}"
511511
when: pg_ctl_status_result.rc == 3
512512

513513
- name: Prepare PostgreSQL | check PostgreSQL is accepting connections
@@ -584,7 +584,7 @@
584584
- name: Prepare PostgreSQL | stop PostgreSQL (will be managed by patroni)
585585
become: true
586586
become_user: postgres
587-
ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl stop -D {{ postgresql_data_dir }} -m fast -w -t 1800"
587+
ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl stop -D {{ postgresql_data_dir }} -m fast -w -t {{ pg_ctl_timeout | default(3600) }}"
588588
when: checkpoint_result.rc is defined and checkpoint_result.rc == 0
589589

590590
- name: Prepare PostgreSQL | check PostgreSQL is stopped
@@ -598,7 +598,7 @@
598598
tags: patroni, patroni_start_master
599599

600600
- block: # PITR (custom bootstrap)
601-
# Prepare (install pexpect, ruamel.yaml)
601+
# Prepare (install pexpect, ruamel.yaml)
602602
- name: Prepare | Make sure the ansible required python library is exist
603603
ansible.builtin.pip:
604604
name: "{{ item }}"
@@ -612,7 +612,8 @@
612612
environment:
613613
PATH: "{{ ansible_env.PATH }}:/usr/local/bin:/usr/bin"
614614
PIP_BREAK_SYSTEM_PACKAGES: "1"
615-
# Run PITR
615+
616+
# Run PITR
616617
- name: Stop patroni service on the Replica servers (if running)
617618
ansible.builtin.systemd:
618619
name: patroni
@@ -625,6 +626,21 @@
625626
state: stopped
626627
when: is_master | bool
627628

629+
- name: Check that PostgreSQL is stopped
630+
become: true
631+
become_user: postgres
632+
ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl status -D {{ postgresql_data_dir }}"
633+
register: pg_ctl_status_result
634+
changed_when: false
635+
failed_when: false
636+
637+
- name: Stop PostgreSQL
638+
become: true
639+
become_user: postgres
640+
ansible.builtin.command: >-
641+
{{ postgresql_bin_dir }}/pg_ctl stop -D {{ postgresql_data_dir }} -m fast -w -t {{ pg_ctl_timeout | default(3600) }}
642+
when: pg_ctl_status_result.rc is defined and (pg_ctl_status_result.rc != 3 and pg_ctl_status_result.rc != 4)
643+
628644
- name: Remove patroni cluster "{{ patroni_cluster_name }}" from DCS (if exist)
629645
become: true
630646
become_user: postgres
@@ -648,7 +664,7 @@
648664
ansible.builtin.command: >
649665
{{ pgbackrest_patroni_cluster_restore_command }}
650666
{{ '--target-action=promote' if pgbackrest_patroni_cluster_restore_command is search('--type=') else '' }}
651-
async: 86400 # timeout 24 hours
667+
async: "{{ cluster_restore_timeout | default(86400) }}" # timeout 24 hours
652668
poll: 0
653669
register: pgbackrest_restore_master
654670
when: is_master | bool
@@ -658,7 +674,7 @@
658674
ansible.builtin.command: >
659675
{{ pgbackrest_patroni_cluster_restore_command }}
660676
{{ '--target-action=shutdown' if pgbackrest_patroni_cluster_restore_command is search('--type=') else '' }}
661-
async: 86400 # timeout 24 hours
677+
async: "{{ cluster_restore_timeout | default(86400) }}" # timeout 24 hours
662678
poll: 0
663679
register: pgbackrest_restore_replica
664680
when: not is_master | bool and 'pgbackrest' in patroni_create_replica_methods
@@ -673,7 +689,7 @@
673689
label: "{{ item.changed }}"
674690
register: pgbackrest_restore_jobs_result
675691
until: pgbackrest_restore_jobs_result.finished
676-
retries: 2880 # timeout 24 hours
692+
retries: "{{ (cluster_restore_timeout | default(86400)) | int // 30 }}" # timeout 24 hours
677693
delay: 30
678694
when: item.ansible_job_id is defined
679695

@@ -685,20 +701,52 @@
685701
when: not keep_patroni_dynamic_json|bool
686702

687703
- name: Start PostgreSQL for Recovery
688-
ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl start -D {{ postgresql_data_dir }} -o '-c hot_standby=off' -w -t 1800"
704+
ansible.builtin.command: >-
705+
{{ postgresql_bin_dir }}/pg_ctl start -D {{ postgresql_data_dir }} -w -t {{ pg_ctl_timeout | default(3600) }}
706+
-o '--config-file={{ postgresql_conf_dir }}/postgresql.conf'
707+
-o '-c hot_standby=off'
708+
{% if postgresql_version | int >= 12 %}
709+
-o '-c restore_command="pgbackrest --stanza={{ pgbackrest_stanza }} archive-get %f %p"'
710+
{% endif %}
711+
-o '-c archive_command=/bin/true'
712+
-l /tmp/pg_recovery_{{ ansible_date_time.date }}.log
713+
async: "{{ pg_ctl_timeout | default(3600) }}" # run the command asynchronously
714+
poll: 0
715+
register: pg_ctl_start_result
689716
when: is_master | bool or (not is_master | bool and 'pgbackrest' in patroni_create_replica_methods)
690717

691-
- name: Waiting for PostgreSQL Recovery to complete (WAL apply)
718+
- name: Wait for the PostgreSQL start command to complete
719+
ansible.builtin.async_status:
720+
jid: "{{ pg_ctl_start_result.ansible_job_id }}"
721+
register: pg_ctl_start_job_result
722+
until: pg_ctl_start_job_result.finished
723+
retries: "{{ (pg_ctl_timeout | default(3600) | int) // 10 }}"
724+
delay: 10
725+
when: is_master | bool or (not is_master | bool and 'pgbackrest' in patroni_create_replica_methods)
726+
727+
- name: Wait for PostgreSQL recovery to complete (WAL apply)
692728
ansible.builtin.command: >-
693-
{{ postgresql_bin_dir }}/psql -p {{ postgresql_port }} -U {{ patroni_superuser_username }} -d postgres -tAXc
694-
"select pg_is_in_recovery()"
729+
{{ postgresql_bin_dir }}/psql -p {{ postgresql_port }} -U {{ patroni_superuser_username }} -d postgres
730+
-tAXc "select pg_is_in_recovery()"
695731
register: pg_is_in_recovery
696-
until: pg_is_in_recovery.stdout != "t"
697-
retries: 1200 # timeout 10 hours
732+
until: pg_is_in_recovery.stdout == "f"
733+
retries: "{{ (cluster_restore_timeout | default(86400)) | int // 30 }}" # timeout 24 hours
698734
delay: 30
699735
changed_when: false
700736
failed_when: false
701-
when: is_master | bool or (not is_master | bool and 'pgbackrest' in patroni_create_replica_methods)
737+
when: is_master | bool
738+
739+
- name: Check PostgreSQL recovery log
740+
ansible.builtin.command: "grep -A2 'recovery stopping' /tmp/pg_recovery_{{ ansible_date_time.date }}.log"
741+
register: pg_recovery_result
742+
changed_when: false
743+
failed_when: false
744+
when: is_master | bool
745+
746+
- name: PostgreSQL recovery details
747+
ansible.builtin.debug:
748+
msg: '{{ pg_recovery_result.stdout_lines }}'
749+
when: pg_recovery_result.stdout_lines is defined
702750

703751
- name: Check that PostgreSQL is stopped
704752
ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl status -D {{ postgresql_data_dir }}"
@@ -707,7 +755,8 @@
707755
failed_when: false
708756

709757
- name: Stop PostgreSQL
710-
ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_ctl stop -D {{ postgresql_data_dir }} -m fast -w -t 1800"
758+
ansible.builtin.command: >-
759+
{{ postgresql_bin_dir }}/pg_ctl stop -D {{ postgresql_data_dir }} -m fast -w -t {{ pg_ctl_timeout | default(3600) }}
711760
when: pg_ctl_status_result.rc is defined and (pg_ctl_status_result.rc != 3 and pg_ctl_status_result.rc != 4)
712761
when: patroni_cluster_bootstrap_method == "pgbackrest"
713762
become: true
@@ -718,36 +767,38 @@
718767
tags: patroni, point_in_time_recovery
719768

720769
- block: # PITR (custom bootstrap) - disable archive_command
721-
- name: Check the patroni.dynamic.json exists
770+
- name: Check if patroni.dynamic.json exists
722771
ansible.builtin.stat:
723772
path: "{{ postgresql_data_dir }}/patroni.dynamic.json"
724773
register: patroni_dynamic_json
774+
when: not keep_patroni_dynamic_json | bool
725775

726776
- name: Remove patroni.dynamic.json file
727777
ansible.builtin.file:
728778
path: "{{ postgresql_data_dir }}/patroni.dynamic.json"
729779
state: absent
730-
when: patroni_dynamic_json.stat.exists and
731-
not keep_patroni_dynamic_json|bool
780+
when:
781+
- patroni_dynamic_json is defined
782+
- patroni_dynamic_json.stat is defined
783+
- patroni_dynamic_json.stat.exists
732784

733785
- name: Edit patroni.dynamic.json | disable archive_command (if enabled)
734786
yedit:
735787
src: "{{ postgresql_data_dir }}/patroni.dynamic.json"
736788
key: postgresql.parameters.archive_command
737789
value: "cd ." # not doing anything yet with WAL-s
738790
content_type: json
739-
when: patroni_dynamic_json.stat.exists and
740-
keep_patroni_dynamic_json|bool and disable_archive_command|bool
791+
when: disable_archive_command | bool
741792

742793
- name: Edit patroni.yml | disable archive_command (if enabled)
743794
yedit:
744795
src: /etc/patroni/patroni.yml
745796
key: bootstrap.dcs.postgresql.parameters.archive_command
746797
value: "cd ." # not doing anything yet with WAL-s
747-
when: disable_archive_command|bool
798+
when: disable_archive_command | bool
748799
when: patroni_cluster_bootstrap_method != "initdb" and
749-
(pgbackrest_install|bool or wal_g_install|bool) and
750-
(existing_pgcluster is not defined or not existing_pgcluster|bool)
800+
(pgbackrest_install | bool or wal_g_install | bool) and
801+
(existing_pgcluster is not defined or not existing_pgcluster | bool)
751802
become: true
752803
become_user: postgres
753804
tags: patroni, point_in_time_recovery
@@ -791,13 +842,27 @@
791842
"select pg_is_in_recovery()"
792843
register: pg_is_in_recovery
793844
until: pg_is_in_recovery.stdout == "f"
794-
retries: 1200 # timeout 10 hours
845+
retries: "{{ (cluster_restore_timeout | default(86400)) | int // 30 }}" # timeout 24 hours
795846
delay: 30
796847
changed_when: false
797848
failed_when: false
798849
when: patroni_cluster_bootstrap_method == "wal-g"
799850

800-
- name: Check PostgreSQL is started and accepting connections on Master
851+
- name: Wait for the Standby cluster initialization to complete
852+
ansible.builtin.uri:
853+
url: "http://{{ inventory_hostname }}:{{ patroni_restapi_port }}/standby-leader"
854+
status_code: 200
855+
register: standby_leader_result
856+
until: standby_leader_result.status == 200
857+
retries: "{{ (cluster_restore_timeout | default(86400)) | int // 30 }}" # timeout 24 hours
858+
delay: 30
859+
environment:
860+
no_proxy: "{{ inventory_hostname }}"
861+
when:
862+
- (patroni_standby_cluster.host is defined and patroni_standby_cluster.host | length > 0)
863+
- not ansible_check_mode
864+
865+
- name: Check PostgreSQL is started and accepting connections
801866
become: true
802867
become_user: postgres
803868
ansible.builtin.command: "{{ postgresql_bin_dir }}/pg_isready -p {{ postgresql_port }}"
@@ -853,8 +918,8 @@
853918
{{ postgresql_bin_dir }}/psql -p {{ postgresql_port }} -U {{ patroni_superuser_username }} -d postgres -tAXc
854919
"select pg_is_in_recovery()"
855920
register: pg_is_in_recovery
856-
until: pg_is_in_recovery.stdout != "t"
857-
retries: 1200 # timeout 10 hours
921+
until: pg_is_in_recovery.stdout == "f"
922+
retries: "{{ (cluster_restore_timeout | default(86400)) | int // 30 }}" # timeout 24 hours
858923
delay: 30
859924
changed_when: false
860925
when: is_master | bool
@@ -961,7 +1026,7 @@
9611026
status_code: 200
9621027
register: replica_result
9631028
until: replica_result.status == 200
964-
retries: 1200 # timeout 10 hours
1029+
retries: "{{ (cluster_restore_timeout | default(86400)) | int // 30 }}" # timeout 24 hours
9651030
delay: 30
9661031
environment:
9671032
no_proxy: "{{ inventory_hostname }}"

automation/vars/main.yml

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -460,11 +460,11 @@ patroni_create_replica_methods:
460460
- basebackup
461461

462462
pgbackrest:
463-
- { option: "command", value: "/usr/bin/pgbackrest --stanza={{ pgbackrest_stanza }} --delta restore" }
463+
- { option: "command", value: "{{ pgbackrest_patroni_cluster_restore_command }}" }
464464
- { option: "keep_data", value: "True" }
465465
- { option: "no_params", value: "True" }
466466
wal_g:
467-
- { option: "command", value: "{{ wal_g_path }} backup-fetch {{ postgresql_data_dir }} LATEST" }
467+
- { option: "command", value: "{{ wal_g_patroni_cluster_bootstrap_command }}" }
468468
- { option: "no_params", value: "True" }
469469
basebackup:
470470
- { option: "max-rate", value: "1000M" }
@@ -645,6 +645,7 @@ pgbackrest_cron_jobs:
645645
# PITR mode (if patroni_cluster_bootstrap_method: "pgbackrest" or "wal-g"):
646646
# 1) The database cluster directory will be cleaned (for "wal-g") or overwritten (for "pgbackrest" --delta restore).
647647
# 2) And also the patroni cluster "{{ patroni_cluster_name }}" will be removed from the DCS (if exist) before recovery.
648+
cluster_restore_timeout: 86400 # backup and WAL restore timeout in seconds (24 hours)
648649

649650
disable_archive_command: true # or 'false' to not disable archive_command after restore
650651
keep_patroni_dynamic_json: true # or 'false' to remove patroni.dynamic.json after restore (if exists)

0 commit comments

Comments
 (0)