Skip to content

Commit b78cb41

Browse files
committed
gufi_distributed cleanup
Add sorting directories found by find(1) randomly, to try to spread consecutive large subtrees across multiple targets; ignore final slurm process stdout; print error messages when jobs fail.
1 parent fe1acff commit b78cb41

File tree

4 files changed

+48
-2
lines changed

4 files changed

+48
-2
lines changed

scripts/gufi_distributed.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363

6464
import argparse
6565
import os
66+
import random
6667
import shlex
6768
import subprocess
6869
import sys
@@ -72,9 +73,10 @@
7273

7374
# how to sort paths found by find(1) at given level
7475
SORT_DIRS = {
75-
'unsorted' : lambda dirs : dirs,
76+
'unsorted' : lambda dirs : dirs, # whatever order find(1) printed in
7677
'path' : lambda dirs : sorted(dirs), # pylint: disable=unnecessary-lambda
7778
'basename' : lambda dirs : sorted(dirs, key = os.path.basename),
79+
'random' : lambda dirs : random.sample(dirs, len(dirs)),
7880
}
7981

8082
# wait on sbatch, not the actual job
@@ -86,6 +88,7 @@ def run_slurm(args, target, cmd):
8688
out, _ = proc.communicate()
8789

8890
if proc.returncode != 0:
91+
sys.stderr.write('slurm job to {0} failed with error code {1}\n'.format(target, proc.returncode))
8992
return None
9093

9194
return out.split()[-1].decode()
@@ -99,7 +102,7 @@ def handle_slurm_procs(args, jobids):
99102
['--dependency', 'after:' + after] if len(after) != 0 else [] +
100103
['--wait', '/dev/stdin'],
101104
stdin=subprocess.PIPE,
102-
stdout=subprocess.DEVNULL, # Python 3.3
105+
stdout=subprocess.PIPE, # not used
103106
shell=True)
104107

105108
# split shebang so shellcheck script doesn't see this
@@ -122,6 +125,8 @@ def handle_ssh_procs(_args, procs):
122125
proc.wait()
123126

124127
if proc.returncode != 0:
128+
sys.stderr.write('ssh to {0} failed with error code {1}\n'.format(proc.args[1], # args[1] only works because there are no arguments between ssh and the target
129+
proc.returncode))
125130
continue
126131

127132
jobids += [proc.pid]

test/regression/gufi_distributed.expected

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,20 @@ Using existing files
116116
Process upper directories up to and including level 0 on 127.0.0.1
117117
Waiting for slurm jobs to complete
118118

119+
# sbatch succeeded but jobs failed (errors are not printed - need to check output files)
120+
$ gufi_dir2index_distributed --sbatch "sbatch" --gufi_dir2index "false" slurm "hostfile" 1 "prefix" "search2"
121+
Splitting 4 paths into 5 groups of max size 1
122+
Range 0: 1 path on localhost
123+
directory directory
124+
Range 1: 1 path on 127.0.0.1
125+
empty_directory empty_directory
126+
Range 2: 1 path on localhost
127+
leaf_directory leaf_directory
128+
Range 3: 1 path on 127.0.0.1
129+
unusual#? directory , unusual#? directory ,
130+
Process upper directories up to and including level 0 on 127.0.0.1
131+
Waiting for slurm jobs to complete
132+
119133
#####################################
120134

121135
#####################################
@@ -231,4 +245,23 @@ Using existing files
231245
Process upper directories up to and including level 0 on 127.0.0.1
232246
Waiting for ssh jobs to complete
233247

248+
# ssh succeeded but jobs failed (prints errors)
249+
$ gufi_dir2index_distributed --ssh "ssh" --gufi_dir2index "false" ssh "hostfile" 1 "prefix" "search2"
250+
ssh to localhost failed with error code 1
251+
ssh to 127.0.0.1 failed with error code 1
252+
ssh to localhost failed with error code 1
253+
ssh to 127.0.0.1 failed with error code 1
254+
ssh to 127.0.0.1 failed with error code 1
255+
Splitting 4 paths into 5 groups of max size 1
256+
Range 0: 1 path on localhost
257+
directory directory
258+
Range 1: 1 path on 127.0.0.1
259+
empty_directory empty_directory
260+
Range 2: 1 path on localhost
261+
leaf_directory leaf_directory
262+
Range 3: 1 path on 127.0.0.1
263+
unusual#? directory , unusual#? directory ,
264+
Process upper directories up to and including level 0 on 127.0.0.1
265+
Waiting for ssh jobs to complete
266+
234267
#####################################

test/regression/gufi_distributed.sh.in

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,9 @@ run_no_sort "@DIFF@ <(${GUFI_QUERY} -d \" \" -S \"SELECT rpath(sname, sroll) FRO
213213
echo "# Use existing group files (path_list.4 does not exist)"
214214
run_no_sort "${GUFI_DIR2INDEX_DISTRIBUTED} --sbatch \"${SBATCH_FAKE}\" --gufi_dir2index \"${GUFI_DIR2INDEX}\" --use-existing-group-files slurm \"${HOSTFILE}\" ${DISTRIBUTED_LEVEL} \"${SRCDIR}\" \"${SEARCH2}\" 2>/dev/null"
215215

216+
echo "# sbatch succeeded but jobs failed (errors are not printed - need to check output files)"
217+
run_no_sort "${GUFI_DIR2INDEX_DISTRIBUTED} --sbatch \"${SBATCH_FAKE}\" --gufi_dir2index \"${FALSE}\" slurm \"${HOSTFILE}\" ${DISTRIBUTED_LEVEL} \"${SRCDIR}\" \"${SEARCH2}\""
218+
216219
# cleanup
217220
cleanup
218221
echo "#####################################"
@@ -255,6 +258,9 @@ run_no_sort "@DIFF@ <(${GUFI_QUERY} -d \" \" -S \"SELECT rpath(sname, sroll) FRO
255258

256259
echo "# Use existing group files (path_list.4 does not exist)"
257260
run_no_sort "${GUFI_DIR2INDEX_DISTRIBUTED} --ssh \"${SSH_FAKE}\" --gufi_dir2index \"${GUFI_DIR2INDEX}\" --use-existing-group-files ssh \"${HOSTFILE}\" ${DISTRIBUTED_LEVEL} \"${SRCDIR}\" \"${SEARCH2}\" 2>/dev/null | tail -n 9"
261+
262+
echo "# ssh succeeded but jobs failed (prints errors)"
263+
run_no_sort "${GUFI_DIR2INDEX_DISTRIBUTED} --ssh \"${SSH_FAKE}\" --gufi_dir2index \"${FALSE}\" ssh \"${HOSTFILE}\" ${DISTRIBUTED_LEVEL} \"${SRCDIR}\" \"${SEARCH2}\""
258264
echo "#####################################"
259265
) 2>&1 | remove_distributed_output | tee "${OUTPUT}"
260266

test/regression/setup.sh.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,7 @@ VERIFYTRACEINTREE="@CMAKE_BINARY_DIR@/contrib/verifytraceintree"
192192
AWK="@AWK@"
193193
COLUMN="@COLUMN@"
194194
DIFF="@DIFF@"
195+
FALSE="$(which false)"
195196
GREP="@GREP@"
196197
SED="@SED@"
197198
STAT="@STAT@"
@@ -276,6 +277,7 @@ replace() {
276277
s/${AWK//\//\\/}/awk/g;
277278
s/${COLUMN//\//\\/}/column/g;
278279
s/${DIFF//\//\\/}/diff/g;
280+
s/${FALSE//\//\\/}/false/g;
279281
s/${GREP//\//\\/}/grep/g;
280282
s/${SED//\//\\/}/sed/g;
281283
s/${STAT//\//\\/}/stat/g;

0 commit comments

Comments
 (0)