Skip to content

Commit 46ccbea

Browse files
Fix cleanup, and restart logic + fix retry logic bug (#351)
* Fix cleanup, and restart logic + fix retry logic bug
* update readme
* changes to restart function - allow setting tag to restart function
* fix readme
1 parent 9e20b1b commit 46ccbea

File tree

2 files changed

+100
-51
lines changed

2 files changed

+100
-51
lines changed

scripts/launch-nodes/README.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -88,14 +88,14 @@ Once the configuration file is ready, make the python script executable and run
8888

8989
```bash
9090
chmod +x manage_subspace.py
91-
python manage_subspace.py --config nodes.toml --release_version gemini-3h-2024-sep-17 --subspace_dir /home/ubuntu/subspace/subspace \
92-
--pot_external_entropy random_value --network gemini-3h --plot-size 10G --cache-percentage 15
91+
python manage_subspace.py --config nodes.toml --release_version docker-tag --subspace_dir /home/ubuntu/subspace/subspace \
92+
--pot_external_entropy random_value --network devnet --plot_size 10G --cache_percentage 15
9393
9494
# prune images
95-
python manage_subspace.py --config nodes.toml --release_version gemini-3h-2024-sep-17 --subspace_dir /home/ubuntu/subspace/subspace --prune
95+
python manage_subspace.py --config nodes.toml --release_version docker-tag --subspace_dir /home/ubuntu/subspace/subspace --network devnet --prune
9696
9797
# restart stack
98-
python manage_subspace.py --config nodes.toml --release_version gemini-3h-2024-sep-17 --subspace_dir /home/ubuntu/subspace/subspace --restart
98+
python manage_subspace.py --config nodes.toml --release_version docker-tag --subspace_dir /home/ubuntu/subspace/subspace --network devnet --restart
9999
100100
```
101101

@@ -106,8 +106,8 @@ python manage_subspace.py --config nodes.toml --release_version gemini-3h-2024-s
106106
- `--subspace_dir`: Path to the Subspace directory (default: /home/ubuntu/subspace).
107107
- `--pot_external_entropy`: The random seed for proof of time entropy.
108108
- `--network`: The network name to be updated in the .env file.
109-
- `--plot-size`: Plot size to be set for Farmer nodes (e.g., 10G).
110-
- `--cache-percentage`: Cache percentage to be set for Farmer nodes.
109+
- `--plot_size`: Plot size to be set for Farmer nodes (e.g., 10G).
110+
- `--cache_percentage`: Cache percentage to be set for Farmer nodes.
111111
- `--prune`: Stop containers and remove unused Docker images.
112112
- `--restart`: Restart containers without wiping data.
113113

scripts/launch-nodes/manage_subspace.py

Lines changed: 94 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -35,28 +35,23 @@ def ssh_connect(host, user, key_file):
3535
logger.error(f"Failed to connect to {host}: {e}")
3636
raise
3737

38-
def run_command(client, command, retries=3, delay=5):
39-
"""Run a command over SSH with retries."""
40-
for attempt in range(retries):
41-
try:
42-
stdin, stdout, stderr = client.exec_command(command)
43-
stdout.channel.recv_exit_status()
44-
output = stdout.read().decode('utf-8')
45-
error = stderr.read().decode('utf-8')
38+
def run_command(client, command):
39+
"""Run a command over SSH and return the output."""
40+
try:
41+
stdin, stdout, stderr = client.exec_command(command)
42+
output = stdout.read().decode('utf-8')
43+
error = stderr.read().decode('utf-8')
4644

47-
# Treat Docker status updates as INFO instead of ERROR
48-
if error and not any(keyword in error for keyword in ["Stopping", "Stopped", "Creating", "Started", "Removing", "Removed"]):
49-
logger.error(f"Error running command: {error.strip()}")
50-
else:
51-
logger.info(f"Command output: {output.strip()}")
52-
return output, error
53-
except Exception as e:
54-
logger.error(f"Attempt {attempt + 1} failed to run command: {e}")
55-
if attempt < retries - 1:
56-
logger.info(f"Retrying in {delay} seconds...")
57-
sleep(delay)
45+
# Treat Docker status updates as INFO instead of ERROR
46+
if error:
47+
if any(keyword in error for keyword in ["Stopping", "Stopped", "Creating", "Started", "Removing", "Removed"]):
48+
logger.info(f"Command output: {error.strip()}")
5849
else:
59-
raise
50+
logger.error(f"Error running command: {error.strip()}")
51+
return output, error
52+
except Exception as e:
53+
logger.error(f"Failed to run command: {e}")
54+
raise
6055

6156
def modify_env_file(client, subspace_dir, release_version, genesis_hash=None, pot_external_entropy=None, plot_size=None, cache_percentage=None, network=None):
6257
"""Modify the .env file to update various settings."""
@@ -94,22 +89,42 @@ def docker_compose_down(client, subspace_dir):
9489
logger.error(f"Failed to run sudo docker compose down -v: {e}")
9590
raise
9691

97-
def docker_compose_restart(client, subspace_dir):
92+
def docker_compose_restart(client, subspace_dir, docker_tag=None):
9893
"""Run sudo docker compose restart in the subspace directory."""
9994
try:
100-
command = f'cd {subspace_dir} && sudo docker compose restart'
95+
# Modify .env file if a new DOCKER_TAG is provided
96+
if docker_tag:
97+
logger.info(f"Updating DOCKER_TAG to {docker_tag} in {subspace_dir}/.env")
98+
modify_env_file(client, subspace_dir, release_version=docker_tag)
99+
100+
# Restart the containers
101+
restart_cmd = f'cd {subspace_dir} && sudo docker compose restart'
101102
logger.info(f"Running sudo docker compose restart in {subspace_dir}")
102-
run_command(client, command)
103+
run_command(client, restart_cmd)
104+
103105
except Exception as e:
104106
logger.error(f"Failed to run sudo docker compose restart: {e}")
105107
raise
106108

107109
def docker_cleanup(client, subspace_dir):
108110
"""Stop all containers, prune unused containers and images in the subspace directory."""
109111
try:
110-
command = f'cd {subspace_dir} && sudo docker stop $(sudo docker ps -q) && sudo docker container prune -f && sudo docker image prune -a -f'
111-
logger.info(f"Running Docker cleanup commands in {subspace_dir}")
112-
run_command(client, command)
112+
# Check if there are running containers
113+
check_running_containers_cmd = f'cd {subspace_dir} && sudo docker ps -q'
114+
stdout, _ = run_command(client, check_running_containers_cmd)
115+
116+
if stdout.strip(): # Only run stop command if there are running containers
117+
stop_containers_cmd = f'cd {subspace_dir} && sudo docker stop $(sudo docker ps -q)'
118+
logger.info(f"Stopping running containers in {subspace_dir}")
119+
run_command(client, stop_containers_cmd)
120+
else:
121+
logger.info("No running containers found to stop.")
122+
123+
# Prune unused containers and images
124+
prune_cmd = f'cd {subspace_dir} && sudo docker container prune -f && sudo docker image prune -a -f'
125+
logger.info(f"Pruning unused containers and images in {subspace_dir}")
126+
run_command(client, prune_cmd)
127+
113128
except Exception as e:
114129
logger.error(f"Failed to run Docker cleanup commands: {e}")
115130
raise
@@ -147,16 +162,26 @@ def grep_protocol_version(client, retries=5, interval=30):
147162
logger.error("Failed to retrieve protocol version hash after retries.")
148163
return None
149164

150-
def handle_node(client, node, subspace_dir, release_version, pot_external_entropy=None, plot_size=None, cache_percentage=None, network=None, prune=False, restart=False):
165+
def handle_node(client, node, subspace_dir, release_version, pot_external_entropy=None,
166+
plot_size=None, cache_percentage=None, network=None, prune=False, restart=False,
167+
update_genesis_hash=False, genesis_hash=None):
151168
"""Generic function to handle different node types with specified actions."""
152169
try:
153-
if restart:
154-
docker_compose_restart(client, subspace_dir)
155-
elif prune:
170+
if prune:
156171
docker_cleanup(client, subspace_dir)
172+
elif restart:
173+
docker_compose_restart(client, subspace_dir)
157174
else:
158175
docker_compose_down(client, subspace_dir)
159-
modify_env_file(client, subspace_dir, release_version, pot_external_entropy=pot_external_entropy, plot_size=plot_size, cache_percentage=cache_percentage, network=network)
176+
177+
# Update .env file with the appropriate parameters
178+
modify_env_file(client, subspace_dir, release_version,
179+
pot_external_entropy=pot_external_entropy,
180+
plot_size=plot_size,
181+
cache_percentage=cache_percentage,
182+
network=network,
183+
genesis_hash=genesis_hash if update_genesis_hash else None)
184+
160185
docker_compose_up(client, subspace_dir)
161186

162187
except Exception as e:
@@ -225,11 +250,24 @@ def main():
225250
try:
226251
logger.info(f"Connecting to RPC node {node['host']}...")
227252
client = ssh_connect(node['host'], node['user'], node['ssh_key'])
228-
handle_node(client, node, args.subspace_dir, args.release_version, pot_external_entropy=args.pot_external_entropy, network=args.network, prune=args.prune, restart=args.restart)
229253

230-
# If this is an RPC node, grep the logs for protocol version hash
254+
# Handle node operations (prune/restart will be managed here)
255+
handle_node(client, node, args.subspace_dir, args.release_version,
256+
pot_external_entropy=args.pot_external_entropy,
257+
network=args.network,
258+
prune=args.prune,
259+
restart=args.restart)
260+
261+
# Skip protocol version extraction if prune or restart is specified
262+
if args.prune or args.restart:
263+
logger.info(f"Skipping protocol version extraction for RPC node {node['host']} due to prune/restart.")
264+
continue
265+
266+
# If this is an RPC node, wait and then extract protocol version hash
231267
logger.info(f"Waiting for RPC node {node['host']} to start...")
232268
sleep(30) # Adjust sleep time as necessary
269+
270+
# Attempt to grep the protocol version from logs
233271
protocol_version_hash = grep_protocol_version(client)
234272

235273
if not protocol_version_hash:
@@ -242,18 +280,29 @@ def main():
242280
client.close()
243281

244282
# Step 4: Handle the bootstrap node, using the protocol version hash if available
245-
if protocol_version_hash:
246-
try:
247-
logger.info(f"Connecting to the bootstrap node {bootstrap_node['host']}...")
248-
client = ssh_connect(bootstrap_node['host'], bootstrap_node['user'], bootstrap_node['ssh_key'])
249-
handle_node(client, bootstrap_node, args.subspace_dir, args.release_version, genesis_hash=protocol_version_hash, pot_external_entropy=args.pot_external_entropy, network=args.network, prune=args.prune, restart=args.restart)
250-
except Exception as e:
251-
logger.error(f"Error handling bootstrap node: {e}")
252-
finally:
253-
if client:
254-
client.close()
255-
else:
256-
logger.error("Protocol version hash not found; skipping bootstrap node update.")
283+
try:
284+
logger.info(f"Connecting to the bootstrap node {bootstrap_node['host']}...")
285+
client = ssh_connect(bootstrap_node['host'], bootstrap_node['user'], bootstrap_node['ssh_key'])
286+
287+
# Warn about missing protocol version hash before proceeding with the bootstrap node update
288+
if not protocol_version_hash:
289+
logger.warning("Protocol version hash not found; proceeding with bootstrap node without genesis_hash update.")
290+
291+
# Handle node operations, updating genesis_hash only if protocol_version_hash is available
292+
handle_node(client, bootstrap_node, args.subspace_dir, args.release_version,
293+
pot_external_entropy=args.pot_external_entropy,
294+
network=args.network,
295+
prune=args.prune,
296+
restart=args.restart,
297+
update_genesis_hash=bool(protocol_version_hash),
298+
genesis_hash=protocol_version_hash)
299+
300+
except Exception as e:
301+
logger.error(f"Error handling bootstrap node: {e}")
302+
finally:
303+
if client:
304+
client.close()
305+
257306

258307
if __name__ == '__main__':
259308
main()

0 commit comments

Comments
 (0)