|
| 1 | +import subprocess |
| 2 | +import os |
| 3 | +import sys |
| 4 | +from enum import Enum |
| 5 | + |
| 6 | + |
class Vars(Enum):
    """Names of the environment variables used to configure the program."""

    # Semicolon-separated list of hosts to collect logs from (required).
    HOSTS = "TARGET_HOSTS"
    # Semicolon-separated list of additional commands to run on each host.
    EXTRA = "EXTRA_COMMANDS"
    # Set to "true" to also print every command result to stdout.
    PRINT = "PRINT_TO_STDOUT"
    # Timeout of a single ssh command in seconds (defaults to 10 when unset).
    SSH_TIMEOUT = "SSH_TIMEOUT"
| 13 | + |
| 14 | + |
# Local directory under which one sub-directory per host is created.
LOG_DIR = "/output-logs"

# Remote command whose exit status tells whether bootstrapping succeeded.
SUCCESS_COMMAND = "sudo cat /run/cluster-api/bootstrap-success.complete"

# Commands executed on every host. The key is the base name of the local
# log file ("<key>.log") that receives the result of the command.
DEFAULT_LOGS = {
    "cloud-init-output": "sudo cat /var/log/cloud-init-output.log",
    "cloud-init": "sudo cat /var/log/cloud-init.log",
    "systemctl-status-cloud-init": "systemctl status cloud-init",
    "journalctl-cloud-init": "sudo journalctl -u cloud-init --no-pager",
}
| 24 | + |
| 25 | + |
def parse_env() -> dict:
    """Parse the configuration environment variables into a dictionary.

    Returns:
        A dictionary keyed by the members of ``Vars``:

        * ``Vars.HOSTS``: list of non-empty, stripped host strings.
        * ``Vars.EXTRA``: list of non-empty, stripped extra commands.
        * ``Vars.PRINT``: ``True`` when the variable is "true"
          (case-insensitive), ``False`` otherwise.
        * ``Vars.SSH_TIMEOUT``: positive number of seconds, 10 when unset.

    The process exits with status 1 when no host is configured or when the
    timeout is not a positive integer.
    """
    cfg = {var: os.environ.get(var.value, "") for var in Vars}

    # Empty entries are dropped so that a trailing or doubled ";" does not
    # produce a host named "" that would later be handed to ssh.
    hosts = [h.strip() for h in cfg[Vars.HOSTS].split(";") if h.strip()]
    if not hosts:
        print(f"ERROR: environment variable {Vars.HOSTS.value} is required")
        sys.exit(1)
    cfg[Vars.HOSTS] = hosts

    cfg[Vars.EXTRA] = [c.strip() for c in cfg[Vars.EXTRA].split(";") if c.strip()]

    # Accept "True"/"TRUE" as well; anything else keeps meaning False.
    cfg[Vars.PRINT] = cfg[Vars.PRINT].strip().lower() == "true"

    raw_timeout = cfg[Vars.SSH_TIMEOUT].strip()
    if not raw_timeout:
        cfg[Vars.SSH_TIMEOUT] = 10
        return cfg

    try:
        timeout = int(raw_timeout)
    except ValueError:
        print("ERROR: SSH_TIMEOUT variable did not contain a number, exiting.")
        sys.exit(1)

    # A zero or negative timeout would make every ssh call fail immediately.
    if timeout <= 0:
        print("ERROR: SSH_TIMEOUT variable must be a positive number, exiting.")
        sys.exit(1)
    cfg[Vars.SSH_TIMEOUT] = timeout

    return cfg
| 54 | + |
| 55 | + |
def run_ssh_command(
    ip: str,
    command: str,
    file: str,
    write_to_file: bool,
    write_to_stdout: bool,
    ssh_timeout: int,
) -> bool:
    """Run a command on a host over ssh and record its result.

    Args:
        ip: Destination passed to ``ssh``.
        command: Command line executed on the remote host.
        file: Path of the local report file; only used when
            ``write_to_file`` is true.
        write_to_file: Write return code, stdout and stderr to ``file``.
        write_to_stdout: Print return code, stdout and stderr to stdout.
        ssh_timeout: Seconds after which the ssh process is killed.

    Returns:
        ``True`` when ssh exited with return code 0 and the report (if
        requested) could be written. ``False`` when ssh could not be
        started, timed out, returned a non-zero code, or the report file
        could not be written.
    """
    # BatchMode: execute commands without being prompted
    # StrictHostKeyChecking: accept server's keys always
    ssh_argv = [
        "ssh",
        "-o",
        "BatchMode=yes",
        "-o",
        "StrictHostKeyChecking=no",
        ip,
        command,
    ]

    try:
        res = subprocess.run(
            ssh_argv,
            capture_output=True,
            timeout=ssh_timeout,
            # Decode explicitly and leniently: remote logs may contain bytes
            # that are not valid in the local encoding, and a strict decode
            # would discard the whole output of the command.
            encoding="utf-8",
            errors="replace",
        )
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        # TimeoutExpired (a SubprocessError), a missing ssh binary (OSError)
        # or arguments that cannot be passed to exec (ValueError).
        print(f"ERROR: ssh command terminated with: {e}")
        return False

    if write_to_file:
        try:
            with open(file, "w", encoding="utf-8") as f:
                f.write(
                    f"Command '{command}' exited with returncode {res.returncode}.\n"
                )
                f.write(f"stdout: {res.stdout}")
                f.write(f"stderr: {res.stderr}")
        except OSError as e:
            # Report local I/O problems as such instead of blaming ssh.
            print(f"ERROR: cannot write '{file}': {e}")
            return False

    if write_to_stdout:
        print(f"Command '{command}' exited with returncode {res.returncode}.\n")
        print(f"stdout: {res.stdout}")
        print(f"stderr: {res.stderr}")

    return res.returncode == 0
| 102 | + |
| 103 | + |
def _make_dir(path: str) -> None:
    """Create directory *path*, tolerating one that already exists.

    Any other ``OSError`` is propagated to the caller.
    """
    try:
        os.mkdir(path)
    except FileExistsError:
        print(f"Directory '{path}' already exists.")


def _collect_node(ip: str, cfg: dict) -> bool:
    """Collect all logs of host *ip*; return True if its bootstrap succeeded."""
    timeout = cfg[Vars.SSH_TIMEOUT]
    print_output = cfg[Vars.PRINT]

    # Check host availability
    if not run_ssh_command(ip, "true", "", False, False, timeout):
        print(
            f"ERROR: node {ip} not available. "
            "Have you mounted SSH keys and operate in correct network?"
        )
        return False

    # Create node directory
    node_dir = f"{LOG_DIR}/{ip}"
    try:
        _make_dir(node_dir)
    except OSError as e:
        print(f"ERROR: cannot create node directory: {e}, skipping node {ip}.")
        return False

    # Check the bootstrapping success file
    bootstrapped = run_ssh_command(
        ip,
        SUCCESS_COMMAND,
        f"{node_dir}/bootstrap-success.log",
        True,
        print_output,
        timeout,
    )

    # Collect default logs
    for name, command in DEFAULT_LOGS.items():
        run_ssh_command(
            ip, command, f"{node_dir}/{name}.log", True, print_output, timeout
        )

    # Run extra commands; their log files are numbered starting from 1.
    for index, command in enumerate(cfg[Vars.EXTRA], start=1):
        run_ssh_command(
            ip, command, f"{node_dir}/{index}.log", True, print_output, timeout
        )

    return bootstrapped


def main():
    """Collect logs from all configured hosts and exit with a summary status.

    Exits with status 0 when the bootstrap check succeeded on every
    configured host, and with status 1 otherwise or when the log directory
    cannot be created.
    """
    cfg = parse_env()

    # Create log directory
    try:
        _make_dir(LOG_DIR)
    except OSError as e:
        # Covers permission problems as well as e.g. a read-only filesystem.
        print(f"ERROR: cannot create working directory: {e}, aborting.")
        sys.exit(1)

    # Iterate over all hosts
    successful_nodes = 0
    for ip in cfg[Vars.HOSTS]:
        if _collect_node(ip, cfg):
            successful_nodes += 1

    # Produce summary and exit code
    total_nodes = len(cfg[Vars.HOSTS])
    if successful_nodes == total_nodes:
        print("bootstrapping for all nodes has succeeded")
        sys.exit(0)
    else:
        print(f"ERROR: {successful_nodes}/{total_nodes} have succeeded")
        sys.exit(1)


if __name__ == "__main__":
    main()
0 commit comments