Skip to content

Commit e44ccba

Browse files
committed
add more features
- function to prune images
- argument for network
- argument for plot_size
- argument for cache_percentage
- function to restart compose
- better handling of ssh connection with retries
- improve sed logic
1 parent b3fcc09 commit e44ccba

File tree

1 file changed

+127
-123
lines changed

1 file changed

+127
-123
lines changed

scripts/launch-nodes/manage_subspace.py

Lines changed: 127 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -35,22 +35,53 @@ def ssh_connect(host, user, key_file):
3535
logger.error(f"Failed to connect to {host}: {e}")
3636
raise
3737

38-
def run_command(client, command):
39-
"""Run a command over SSH and return the output."""
40-
try:
41-
stdin, stdout, stderr = client.exec_command(command)
42-
output = stdout.read().decode('utf-8')
43-
error = stderr.read().decode('utf-8')
38+
def run_command(client, command, retries=3, delay=5):
    """Run a command over SSH with retries.

    Args:
        client: Connected SSH client exposing a paramiko-style
            ``exec_command(command)`` returning ``(stdin, stdout, stderr)``.
        command: Shell command line to run on the remote host.
        retries: Maximum number of attempts. Values below 1 are treated as 1
            so the function never falls through and returns ``None``.
        delay: Seconds to sleep between a failed attempt and the next one.

    Returns:
        A ``(output, error)`` tuple holding the decoded remote stdout and
        stderr of the first attempt that completed.

    Raises:
        Exception: Whatever the final attempt raised, once every attempt
            has failed.
    """
    docker_status_keywords = ("Stopping", "Stopped", "Creating", "Started", "Removing", "Removed")
    attempts = max(1, retries)

    for attempt in range(attempts):
        try:
            _, stdout, stderr = client.exec_command(command)
            # Drain both streams BEFORE asking for the exit status: waiting
            # for the status first can block forever when the remote output
            # is larger than the channel window, since nothing consumes it.
            output = stdout.read().decode('utf-8')
            error = stderr.read().decode('utf-8')
            exit_status = stdout.channel.recv_exit_status()
            logger.debug(f"Command exited with status {exit_status}: {command}")

            # Treat Docker status updates as INFO instead of ERROR
            if error and not any(keyword in error for keyword in docker_status_keywords):
                logger.error(f"Error running command: {error.strip()}")
            else:
                logger.info(f"Command output: {output.strip()}")
            return output, error
        except Exception as e:
            logger.error(f"Attempt {attempt + 1} failed to run command: {e}")
            if attempt >= attempts - 1:
                raise
            logger.info(f"Retrying in {delay} seconds...")
            sleep(delay)
60+
61+
def modify_env_file(client, subspace_dir, release_version, genesis_hash=None, pot_external_entropy=None, plot_size=None, cache_percentage=None, network=None):
    """Modify the .env file to update various settings.

    ``DOCKER_TAG`` is always rewritten; every other key is rewritten only
    when a truthy value is supplied. Each key is updated with a separate
    ``sed -i '/^KEY=/c\\KEY=value'`` command run through ``run_command``.

    NOTE: the sed ``c`` command only replaces lines that already match, so a
    key that is absent from the file is left absent rather than appended.

    Args:
        client: SSH client handed through to ``run_command``.
        subspace_dir: Remote directory that contains the ``.env`` file.
        release_version: Value written to ``DOCKER_TAG``.
        genesis_hash: Optional value for ``GENESIS_HASH``.
        pot_external_entropy: Optional value for ``POT_EXTERNAL_ENTROPY``.
        plot_size: Optional value for ``PLOT_SIZE``.
        cache_percentage: Optional value for ``CACHE_PERCENTAGE``.
        network: Optional value for ``NETWORK_NAME``.

    Raises:
        Exception: If any sed command writes to stderr, or if running a
            command fails; the error is logged before being re-raised.
    """
    import shlex  # function-scope import keeps this block self-contained

    optional_settings = {
        "GENESIS_HASH": genesis_hash,
        "POT_EXTERNAL_ENTROPY": pot_external_entropy,
        "PLOT_SIZE": plot_size,
        "CACHE_PERCENTAGE": cache_percentage,
        "NETWORK_NAME": network,
    }
    settings = [("DOCKER_TAG", release_version)]
    settings += [(key, value) for key, value in optional_settings.items() if value]

    # Quote the path and the whole sed script so that spaces or shell
    # metacharacters in a value cannot break out of the remote command.
    env_path = shlex.quote(f"{subspace_dir}/.env")

    try:
        for key, value in settings:
            sed_script = f"/^{key}=/c\\{key}={value}"
            command = f"sed -i {shlex.quote(sed_script)} {env_path}"
            stdout, stderr = run_command(client, command)
            if stderr:
                logger.error(f"Error modifying .env file with command: {command}, error: {stderr}")
                raise Exception(f"Error modifying .env file: {stderr}")
            logger.info(f"Successfully executed command: {command}")
    except Exception as e:
        logger.error(f"Failed to modify .env file: {e}")
        raise
5586

5687
def docker_compose_down(client, subspace_dir):
@@ -63,40 +94,34 @@ def docker_compose_down(client, subspace_dir):
6394
logger.error(f"Failed to run sudo docker compose down -v: {e}")
6495
raise
6596

66-
def modify_env_file(client, subspace_dir, release_version, genesis_hash=None, pot_external_entropy=None):
67-
"""Modify the .env file to update the Docker tag, Genesis Hash, and POT_EXTERNAL_ENTROPY using sed."""
97+
def docker_compose_restart(client, subspace_dir):
    """Restart the compose services defined in the subspace directory.

    Runs ``sudo docker compose restart`` from ``subspace_dir`` on the remote
    host via ``run_command``. Any failure is logged and re-raised.
    """
    try:
        logger.info(f"Running sudo docker compose restart in {subspace_dir}")
        run_command(client, f'cd {subspace_dir} && sudo docker compose restart')
    except Exception as exc:
        logger.error(f"Failed to run sudo docker compose restart: {exc}")
        raise
91106

92-
if stderr_text:
93-
logger.error(f"Error modifying .env file with command: {command}, error: {stderr_text}")
94-
raise Exception(f"Error modifying .env file: {stderr_text}")
95-
else:
96-
logger.info(f"Successfully executed command: {command}")
107+
def docker_cleanup(client, subspace_dir):
    """Stop all containers, prune unused containers and images in the subspace directory.

    Args:
        client: SSH client handed through to ``run_command``.
        subspace_dir: Remote directory the commands are run from.

    Raises:
        Exception: Re-raises whatever ``run_command`` raised, after logging.
    """
    try:
        # `docker stop` errors out when given no container IDs, and with a
        # plain `&&` chain that would skip both prune steps. Only call it
        # when `docker ps -q` actually listed something.
        stop_running = '(ids=$(sudo docker ps -q); [ -z "$ids" ] || sudo docker stop $ids)'
        command = (
            f'cd {subspace_dir} && {stop_running} && '
            'sudo docker container prune -f && sudo docker image prune -a -f'
        )
        logger.info(f"Running Docker cleanup commands in {subspace_dir}")
        run_command(client, command)
    except Exception as e:
        logger.error(f"Failed to run Docker cleanup commands: {e}")
        raise
97116

117+
def docker_compose_up(client, subspace_dir):
    """Start the compose services in the subspace directory, detached.

    Runs ``sudo docker compose up -d`` from ``subspace_dir`` on the remote
    host via ``run_command``. Any failure is logged and re-raised.
    """
    try:
        logger.info(f"Running sudo docker compose up -d in {subspace_dir}")
        run_command(client, f'cd {subspace_dir} && sudo docker compose up -d')
    except Exception as exc:
        logger.error(f"Failed to run sudo docker compose up -d: {exc}")
        raise
101126

102127
def grep_protocol_version(client, retries=5, interval=30):
@@ -122,135 +147,114 @@ def grep_protocol_version(client, retries=5, interval=30):
122147
logger.error("Failed to retrieve protocol version hash after retries.")
123148
return None
124149

125-
def docker_compose_up(client, subspace_dir):
126-
"""Run sudo docker compose up -d in the subspace directory."""
150+
def handle_node(client, node, subspace_dir, release_version, pot_external_entropy=None, plot_size=None, cache_percentage=None, network=None, prune=False, restart=False, genesis_hash=None):
    """Generic function to handle different node types with specified actions.

    Runs compose down, rewrites the ``.env`` file, runs compose up, then
    optionally restarts the services and/or runs the Docker cleanup.

    The SSH client is owned by the caller and is NOT closed here: callers
    close it in their own ``finally`` blocks, and the RPC flow keeps using
    the same connection after this function returns.

    Args:
        client: Connected SSH client for the node.
        node: Node config mapping; ``node['host']`` is used in log messages.
        subspace_dir: Remote directory holding the compose project.
        release_version: Value written to ``DOCKER_TAG``.
        pot_external_entropy: Optional ``POT_EXTERNAL_ENTROPY`` value.
        plot_size: Optional ``PLOT_SIZE`` value.
        cache_percentage: Optional ``CACHE_PERCENTAGE`` value.
        network: Optional ``NETWORK_NAME`` value.
        prune: When true, run ``docker_cleanup`` at the end.
        restart: When true, run ``docker_compose_restart`` after start-up.
        genesis_hash: Optional ``GENESIS_HASH`` value (used for the
            bootstrap node).

    Raises:
        Exception: Any failure of the steps above is logged and re-raised so
            the caller does not carry on as if the node had been updated.
    """
    try:
        docker_compose_down(client, subspace_dir)
        modify_env_file(
            client,
            subspace_dir,
            release_version,
            genesis_hash=genesis_hash,
            pot_external_entropy=pot_external_entropy,
            plot_size=plot_size,
            cache_percentage=cache_percentage,
            network=network,
        )
        docker_compose_up(client, subspace_dir)

        # NOTE(review): restart runs after a full down/up cycle, so it does
        # not skip the down step — confirm this matches the intent of the
        # --restart flag.
        if restart:
            docker_compose_restart(client, subspace_dir)

        # NOTE(review): cleanup stops all containers and therefore also the
        # services that were just started above — confirm the intended order.
        if prune:
            docker_cleanup(client, subspace_dir)
    except Exception as e:
        logger.error(f"Error handling node {node['host']}: {e}")
        raise
134168

135169
def main():
    """Parse the command line and roll the release out to every configured node.

    Nodes are handled in this order: the timekeeper (unless
    ``--no-timekeeper`` is given or none is configured), the farmer nodes,
    the RPC nodes (whose logs are searched for the protocol version hash)
    and finally the bootstrap node, which is only updated when a hash was
    found. Errors on one node are logged and do not stop the remaining nodes.
    """
    parser = argparse.ArgumentParser(description="Manage Subspace nodes via SSH")
    parser.add_argument('--config', required=True, help='Path to the TOML config file')
    parser.add_argument('--network', required=True, help='Network to update in the .env file, i.e devnet, gemini-3h, taurus')
    parser.add_argument('--release_version', required=True, help='Release version to update in the .env file')
    parser.add_argument('--subspace_dir', default='/home/ubuntu/subspace', help='Path to the Subspace directory')
    parser.add_argument('--pot_external_entropy', help='POT_EXTERNAL_ENTROPY value for all nodes')
    parser.add_argument('--log_level', default='INFO', help='Set the logging level (DEBUG, INFO, WARNING, ERROR)')
    parser.add_argument('--no-timekeeper', action='store_true', help='Disable launching of the timekeeper node')
    parser.add_argument('--prune', action='store_true', help='Stop containers and destroy the Docker images')
    parser.add_argument('--restart', action='store_true', help='Restart the network without wiping the data')
    parser.add_argument('--plot-size', help='Set plot size on the farmer, i.e 10G')
    parser.add_argument('--cache-percentage', help='Set the cache percentage on the farmer, i.e 10')
    args = parser.parse_args()

    # Set logging level based on user input
    log_level = args.log_level.upper()
    logging.getLogger().setLevel(log_level)

    # Read configuration from the TOML file
    with open(args.config, 'rb') as f:
        config = tomli.load(f)

    bootstrap_node = config['bootstrap_node']
    farmer_nodes = [node for node in config['farmer_rpc_nodes'] if node['type'] == 'farmer']
    rpc_nodes = [node for node in config['farmer_rpc_nodes'] if node['type'] == 'rpc']
    # The timekeeper is treated as optional below, so a missing table must
    # not abort the whole run with a KeyError.
    timekeeper_node = config.get('timekeeper')

    # Step 1: Handle the timekeeper node, if enabled
    if not args.no_timekeeper and timekeeper_node:
        # Bind client before the try so the finally block is safe even when
        # ssh_connect itself raises.
        client = None
        try:
            logger.info(f"Connecting to timekeeper node {timekeeper_node['host']}...")
            client = ssh_connect(timekeeper_node['host'], timekeeper_node['user'], timekeeper_node['ssh_key'])
            handle_node(client, timekeeper_node, args.subspace_dir, args.release_version, pot_external_entropy=args.pot_external_entropy, network=args.network, prune=args.prune, restart=args.restart)
        except Exception as e:
            logger.error(f"Error handling timekeeper node: {e}")
        finally:
            if client:
                client.close()
    else:
        logger.info("Timekeeper handling is disabled or not specified.")

    # Step 2: Handle farmer nodes
    for node in farmer_nodes:
        client = None  # never reuse (and re-close) the previous node's client
        try:
            logger.info(f"Connecting to farmer node {node['host']}...")
            client = ssh_connect(node['host'], node['user'], node['ssh_key'])
            handle_node(client, node, args.subspace_dir, args.release_version, pot_external_entropy=args.pot_external_entropy, network=args.network, plot_size=args.plot_size, cache_percentage=args.cache_percentage, prune=args.prune, restart=args.restart)
        except Exception as e:
            logger.error(f"Error handling farmer node {node['host']}: {e}")
        finally:
            if client:
                client.close()

    # Step 3: Handle RPC nodes
    protocol_version_hash = None
    for node in rpc_nodes:
        client = None
        try:
            logger.info(f"Connecting to RPC node {node['host']}...")
            client = ssh_connect(node['host'], node['user'], node['ssh_key'])
            handle_node(client, node, args.subspace_dir, args.release_version, pot_external_entropy=args.pot_external_entropy, network=args.network, prune=args.prune, restart=args.restart)

            # If this is an RPC node, grep the logs for protocol version hash
            logger.info(f"Waiting for RPC node {node['host']} to start...")
            sleep(30)  # Adjust sleep time as necessary
            node_hash = grep_protocol_version(client)

            if node_hash:
                # Keep the hash only when one was found, so a later RPC node
                # that yields nothing cannot discard an earlier good value.
                protocol_version_hash = node_hash
            else:
                logger.error(f"Failed to retrieve protocol version hash on RPC node {node['host']}")
        except Exception as e:
            logger.error(f"Error handling RPC node {node['host']}: {e}")
        finally:
            if client:
                client.close()

    # Step 4: Handle the bootstrap node, using the protocol version hash if available
    if protocol_version_hash:
        client = None
        try:
            logger.info(f"Connecting to the bootstrap node {bootstrap_node['host']}...")
            client = ssh_connect(bootstrap_node['host'], bootstrap_node['user'], bootstrap_node['ssh_key'])
            handle_node(client, bootstrap_node, args.subspace_dir, args.release_version, genesis_hash=protocol_version_hash, pot_external_entropy=args.pot_external_entropy, network=args.network, prune=args.prune, restart=args.restart)
        except Exception as e:
            logger.error(f"Error handling bootstrap node: {e}")
        finally:
            if client:
                client.close()
    else:
        logger.error("Protocol version hash not found; skipping bootstrap node update.")
254258

255259
if __name__ == '__main__':
256260
main()

0 commit comments

Comments
 (0)