Skip to content

Commit c0fcc42

Browse files
Merge pull request #48 from Nordix/nuhakala/cloud-init-tool
🌱 Add a tool to collect cloud-init logs
2 parents 0f6fce5 + 5668dad commit c0fcc42

File tree

4 files changed

+268
-0
lines changed

4 files changed

+268
-0
lines changed

.github/workflows/build-images-action.yml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,17 @@ jobs:
6060
QUAY_USERNAME: ${{ secrets.QUAY_USERNAME }}
6161
QUAY_PASSWORD: ${{ secrets.QUAY_PASSWORD }}
6262
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
63+
# Job that builds the cloud-init-tool container image by delegating to the
# reusable container-image-build workflow hosted in metal3-io/project-infra.
build_cloud-init-tool:
  name: Build cloud-init-tool container image
  # Only run in the upstream repository, never in forks.
  if: github.repository == 'metal3-io/utility-images'
  uses: metal3-io/project-infra/.github/workflows/container-image-build.yml@main
  # Inputs are interpreted by the reusable workflow referenced above.
  with:
    image-name: 'cloud-init-tool'
    # Directory in this repository that contains the Dockerfile.
    dockerfile-directory: cloud-init-tool
    pushImage: true
    generate-sbom: true
    sign-image: true
  # Repository secrets forwarded to the reusable workflow.
  secrets:
    QUAY_USERNAME: ${{ secrets.QUAY_USERNAME }}
    QUAY_PASSWORD: ${{ secrets.QUAY_PASSWORD }}
    SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

cloud-init-tool/Dockerfile

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
FROM alpine:3.19

# image.version is set during image build by automation
LABEL org.opencontainers.image.authors="[email protected]"
LABEL org.opencontainers.image.description="Metal3 Cloud init logs fetching tool"
LABEL org.opencontainers.image.documentation="https://github.com/metal3-io/utility-images"
LABEL org.opencontainers.image.licenses="Apache License 2.0"
LABEL org.opencontainers.image.title="Metal3 Cloud Init tool"
LABEL org.opencontainers.image.url="https://github.com/metal3-io/utility-images"
LABEL org.opencontainers.image.vendor="Metal3-io"

# openssh provides the ssh client used by main.py; python3 runs the script.
# --no-cache keeps the apk package index out of the image layer.
RUN apk add --no-cache openssh python3

COPY main.py /scripts/

CMD ["python3", "/scripts/main.py"]

cloud-init-tool/README.md

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Cloud-init tool
2+
3+
This tool is used to fetch cloud-init status and logs from a BareMetalHost (BMH). It
4+
accesses the host over SSH.
5+
6+
## Using cloud-init tool
7+
8+
The tool requires two things:
9+
10+
- `TARGET_HOSTS` env variable containing node IP addresses
11+
- SSH key as bind mount
12+
13+
In addition, the user can affect the behavior with the following variables:
14+
15+
- `EXTRA_COMMANDS` to run extra commands on the BMH
16+
- `PRINT_TO_STDOUT` to print results to container stdout as well.
17+
  Mount the log directory to the host machine to access the log files.
18+
- `SSH_TIMEOUT` to set ssh connection timeout in seconds (default: 10)
19+
20+
The `TARGET_HOSTS` must give the target hosts in format `user@ip-address`.
21+
Multiple hosts can be defined by separating the hosts with `;`. Extra commands
22+
are given in env variable `EXTRA_COMMANDS` and they are separated by `;`. The
23+
output of extra commands is also written to log files under the log directory.
24+
The output of all commands can also be written to container stdout; this can be
25+
enabled by setting the `PRINT_TO_STDOUT` env variable to `true`. The container will write logs
26+
to the `/output-logs` directory in the container.
27+
28+
Example command for usage:
29+
30+
``` sh
31+
docker run \
32+
--net=host \
33+
-e TARGET_HOSTS="user@192.0.2.10" \
34+
-e EXTRA_COMMANDS="cat /etc/os-release; ls -l /etc" \
35+
-v "${HOME}/.ssh":/root/.ssh \
36+
-v ./output-logs:/output-logs \
37+
cloud-init-tool
38+
```
39+
40+
Note that `--net=host` is most likely needed to avoid connection issues.
41+
42+
## Container exit status
43+
44+
The tool returns exit code 0 if all the given hosts have bootstrapped
45+
successfully. If the tool cannot execute properly, for example due to a connection
46+
issue, or if any of the hosts has not bootstrapped successfully, the tool returns
47+
exit code 1.
48+
49+
The same exit status will also be the exit status of `docker run` command. So
50+
the bootstrapping status can be simply checked by checking the exit code of
51+
`docker run` command.

cloud-init-tool/main.py

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
import subprocess
2+
import os
3+
import sys
4+
from enum import Enum
5+
6+
7+
# The env variables used to configure the program.
8+
class Vars(Enum):
    """Environment variables that configure the tool.

    Each member's value is the name of an environment variable. parse_env()
    reads every member and uses the members themselves as the keys of the
    configuration dictionary it returns.
    """

    # Semicolon-separated list of SSH targets; required.
    HOSTS = "TARGET_HOSTS"
    # Semicolon-separated list of additional commands to run on each host.
    EXTRA = "EXTRA_COMMANDS"
    # Set to "true" to also print command results to stdout.
    PRINT = "PRINT_TO_STDOUT"
    # Timeout in seconds applied to each ssh invocation (defaults to 10).
    SSH_TIMEOUT = "SSH_TIMEOUT"
13+
14+
15+
# Commands that main() runs on every reachable host. The key is the base name
# of the log file ("<key>.log" in the host's directory) that receives the
# command's result; the value is the command executed over SSH.
DEFAULT_LOGS = {
    "cloud-init-output": "sudo cat /var/log/cloud-init-output.log",
    "cloud-init": "sudo cat /var/log/cloud-init.log",
    "systemctl-status-cloud-init": "systemctl status cloud-init",
    "journalctl-cloud-init": "sudo journalctl -u cloud-init --no-pager",
}

# Directory under which one sub-directory per host is created for log files.
LOG_DIR = "/output-logs"
# main() counts a host as successfully bootstrapped when this command exits
# with status 0 on that host.
SUCCESS_COMMAND = "sudo cat /run/cluster-api/bootstrap-success.complete"
24+
25+
26+
# Parses the env variables and saves them in more usable form to dictionary.
27+
def parse_env() -> dict:
    """Read the configuration from the environment and normalize it.

    Returns:
        A dictionary keyed by ``Vars`` members:

        * ``Vars.HOSTS``: non-empty list of SSH targets (split on ``;``,
          whitespace stripped, empty entries dropped).
        * ``Vars.EXTRA``: list of extra commands (split on ``;``, whitespace
          stripped, empty entries dropped); may be empty.
        * ``Vars.PRINT``: ``True`` only when the variable is ``true``
          (case-insensitive, surrounding whitespace ignored).
        * ``Vars.SSH_TIMEOUT``: positive ``int``; 10 when the variable is unset
          or blank.

    Exits the process with status 1 when no host is given or when the timeout
    is not a positive integer.
    """
    cfg = {var: os.environ.get(var.value, "") for var in Vars}

    # Drop empty entries so that a trailing or doubled ';' does not create a
    # phantom host that can never be reached and would force exit code 1.
    hosts = [h.strip() for h in cfg[Vars.HOSTS].split(";") if h.strip()]
    if not hosts:
        print(f"ERROR: environment variable {Vars.HOSTS.value} is required")
        sys.exit(1)
    cfg[Vars.HOSTS] = hosts

    cfg[Vars.EXTRA] = [c.strip() for c in cfg[Vars.EXTRA].split(";") if c.strip()]
    cfg[Vars.PRINT] = cfg[Vars.PRINT].strip().lower() == "true"

    raw_timeout = cfg[Vars.SSH_TIMEOUT].strip()
    if not raw_timeout:
        cfg[Vars.SSH_TIMEOUT] = 10
    else:
        try:
            timeout = int(raw_timeout)
        except ValueError:
            print("ERROR: SSH_TIMEOUT variable did not contain a number, exiting.")
            sys.exit(1)
        # A zero or negative timeout would make every ssh call time out
        # immediately or be rejected by subprocess, so refuse it up front.
        if timeout <= 0:
            print("ERROR: SSH_TIMEOUT must be a positive number of seconds, exiting.")
            sys.exit(1)
        cfg[Vars.SSH_TIMEOUT] = timeout

    return cfg
54+
55+
56+
def run_ssh_command(
    ip: str,
    command: str,
    file: str,
    write_to_file: bool,
    write_to_stdout: bool,
    ssh_timeout: int,
) -> bool:
    """Run ``command`` on a remote host over SSH and record the result.

    Args:
        ip: SSH destination passed to ``ssh`` as-is (e.g. ``user@ip-address``).
        command: Command line executed on the remote host.
        file: Path of the log file to (over)write; only used when
            ``write_to_file`` is true.
        write_to_file: Write return code, stdout and stderr to ``file``.
        write_to_stdout: Print return code, stdout and stderr to stdout.
        ssh_timeout: Seconds after which the whole ssh invocation (connection
            and command execution) is aborted.

    Returns:
        ``True`` when ssh exited with status 0 and the result was recorded;
        ``False`` when ssh could not be run, timed out, exited non-zero, or
        the log file could not be written.
    """
    try:
        # BatchMode: execute commands without being prompted
        # StrictHostKeyChecking: accept server's keys always (note: this
        # disables host key verification)
        res = subprocess.run(
            [
                "ssh",
                "-o",
                "BatchMode=yes",
                "-o",
                "StrictHostKeyChecking=no",
                ip,
                command,
            ],
            capture_output=True,
            timeout=ssh_timeout,
            # Logs may contain bytes that are not valid UTF-8; replace them
            # instead of failing and losing the whole log.
            encoding="utf-8",
            errors="replace",
        )
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        # Covers timeouts, a missing ssh binary and invalid arguments.
        print(f"ERROR: ssh command terminated with: {e}")
        return False

    if write_to_file:
        # Make sure the "stderr:" label starts on its own line even when the
        # command's stdout is empty or lacks a trailing newline.
        stdout_text = res.stdout if res.stdout.endswith("\n") else res.stdout + "\n"
        try:
            with open(file, "w", encoding="utf-8") as f:
                f.write(
                    f"Command '{command}' exited with returncode {res.returncode}.\n"
                )
                f.write(f"stdout: {stdout_text}")
                f.write(f"stderr: {res.stderr}")
        except OSError as e:
            print(f"ERROR: cannot write log file '{file}': {e}")
            return False

    if write_to_stdout:
        print(f"Command '{command}' exited with returncode {res.returncode}.\n")
        print(f"stdout: {res.stdout}")
        print(f"stderr: {res.stderr}")

    return res.returncode == 0
102+
103+
104+
def main():
    """Collect cloud-init logs from every configured host and report status.

    For each host the function checks SSH reachability, creates a per-host
    directory under ``LOG_DIR``, runs the bootstrap-success check, then the
    default log commands and any extra commands, writing each result to a
    log file. Unreachable hosts and hosts whose directory cannot be created
    are skipped and count as failures.

    Exits with status 0 only when the bootstrap-success check passed on every
    configured host, otherwise with status 1.
    """
    cfg = parse_env()
    print_to_stdout = cfg[Vars.PRINT]
    ssh_timeout = cfg[Vars.SSH_TIMEOUT]

    # Create log directory
    try:
        os.mkdir(LOG_DIR)
    except FileExistsError:
        print(f"Directory '{LOG_DIR}' already exists.")
    except PermissionError:
        print("ERROR: cannot create working directory, permission denied, aborting.")
        sys.exit(1)
    except OSError as e:
        # e.g. read-only mount or missing parent; fail cleanly, no traceback.
        print(f"ERROR: cannot create working directory: {e}, aborting.")
        sys.exit(1)

    # Iterate over all hosts
    successful_nodes = 0
    for ip in cfg[Vars.HOSTS]:
        # Check host availability
        if not run_ssh_command(ip, "true", "", False, False, ssh_timeout):
            print(
                f"ERROR: node {ip} not available. "
                "Have you mounted SSH keys and operate in correct network?"
            )
            continue

        # Create node directory
        node_dir = f"{LOG_DIR}/{ip}"
        try:
            os.mkdir(node_dir)
        except FileExistsError:
            print(f"Directory '{node_dir}' already exists.")
        except PermissionError:
            print(
                f"ERROR: cannot create node directory, permission denied, skipping node {ip}."
            )
            continue
        except OSError as e:
            print(f"ERROR: cannot create node directory: {e}, skipping node {ip}.")
            continue

        # Check the bootstrapping success file
        if run_ssh_command(
            ip,
            SUCCESS_COMMAND,
            f"{node_dir}/bootstrap-success.log",
            True,
            print_to_stdout,
            ssh_timeout,
        ):
            successful_nodes += 1

        # Collect default logs
        for file, command in DEFAULT_LOGS.items():
            run_ssh_command(
                ip,
                command,
                f"{node_dir}/{file}.log",
                True,
                print_to_stdout,
                ssh_timeout,
            )

        # Run extra commands; their logs are named 1.log, 2.log, ...
        for index, command in enumerate(cfg[Vars.EXTRA], start=1):
            run_ssh_command(
                ip,
                command,
                f"{node_dir}/{index}.log",
                True,
                print_to_stdout,
                ssh_timeout,
            )

    # Produce summary and exit code
    total_nodes = len(cfg[Vars.HOSTS])
    if successful_nodes == total_nodes:
        print("bootstrapping for all nodes has succeeded")
        sys.exit(0)
    else:
        print(f"ERROR: {successful_nodes}/{total_nodes} have succeeded")
        sys.exit(1)
184+
185+
186+
# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)