Skip to content

Commit 708a633

Browse files
committed
change -D lhs rhs to exact match
[lhs.*, rhs.*] -> [lhs.*, rhs]: remove the duplicate tree walk that occurs when paths share matching prefixes (e.g. a, aa, aaa, aaaa). Previously, using 2 nodes, the groups were [a.*, aa.*] and [aaa.*, aaaa.*], so the work done was: Node 1: a, aa, aaa, aaaa; Node 2: aaa, aaaa (aaa and aaaa were walked twice). Also updated the gufi_distributed text to better reflect what is happening. This still has the issue of potentially processing many directories on the same node if many directories fall within the same group. Instead of using -D, an alternative method would be to pass the actual source paths at level L as input arguments; however, there is a limit on shell command size, and there is no guarantee that all paths at level L within one group will fit in one command, meaning the requested node count could not be honored.
1 parent 92dc313 commit 708a633

File tree

7 files changed

+63
-62
lines changed

7 files changed

+63
-62
lines changed

include/str.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,8 +124,8 @@ typedef struct str_range {
124124
} str_range_t;
125125

126126
/*
127-
* [lhs.*, rhs.*]
128-
* prefix match
127+
* [lhs.*, rhs]
128+
* exact match
129129
*/
130130
int str_range_cmp(const str_range_t *range, const refstr_t *str);
131131
/* ****************************************************** */

scripts/gufi_distributed.py

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -97,25 +97,22 @@ def dirs_at_level(root, level):
9797
return sorted([path.decode() for path in dirs.split(b'\x00') if len(path) > 0])
9898

9999
# step 2
100-
# split directories into groups for processing
100+
# split dirs into groups of unique basenames for processing
101101
def group_dirs(dirs, splits):
102-
count = len(dirs)
102+
basenames = list(set(os.path.basename(path) for path in dirs))
103+
count = len(basenames)
103104
group_size = count // splits + int(bool(count % splits))
104-
ordered = sorted(dirs, key=os.path.basename)
105-
return group_size, [ordered[i: i + group_size] for i in range(0, count, group_size)]
106-
107-
def dir_plural(count):
108-
return 'directories' if count > 1 else 'directory'
105+
ordered = sorted(basenames)
106+
return count, group_size, [ordered[i: i + group_size] for i in range(0, count, group_size)]
109107

110108
# step 3
111109
# get only the first and last paths in each group
112110
# print debug messages
113111
# run function to schedule jobs if it exists
114-
def schedule_subtrees(dir_count, splits, group_size, groups, schedule_subtree):
115-
print('Splitting {0} {1} into {2} chunks of max size {3}'.format(dir_count,
116-
dir_plural(dir_count),
117-
splits,
118-
group_size))
112+
def schedule_subtrees(unique_basenames, splits, group_size, groups, schedule_subtree):
113+
print('Splitting {0} unique basenames into {1} groups of max size {2}'.format(unique_basenames,
114+
splits,
115+
group_size))
119116

120117
jobids = []
121118
for i, group in enumerate(groups):
@@ -124,7 +121,7 @@ def schedule_subtrees(dir_count, splits, group_size, groups, schedule_subtree):
124121
if count == 0:
125122
break
126123

127-
print(' Range {0}: {1} {2}'.format(i, count, dir_plural(count)))
124+
print(' Range {0}: {1} basename{2}'.format(i, count, 's' if count != 1 else ''))
128125
print(' {0} {1}'.format(group[0], group[-1]))
129126

130127
if schedule_subtree is not None:
@@ -149,8 +146,8 @@ def schedule_top(func, jobids):
149146
# call this combined function to distribute work
150147
def distribute_work(root, level, nodes, schedule_subtree_func, schedule_top_func):
151148
dirs = dirs_at_level(root, level)
152-
group_size, groups = group_dirs(dirs, nodes)
153-
jobids = schedule_subtrees(len(dirs), nodes, group_size, groups, schedule_subtree_func)
149+
unique_basenames, group_size, groups = group_dirs(dirs, nodes)
150+
jobids = schedule_subtrees(unique_basenames, nodes, group_size, groups, schedule_subtree_func)
154151
jobids += [schedule_top(schedule_top_func, jobids).decode()]
155152
return jobids
156153

src/str.c

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -92,32 +92,23 @@ void str_free_existing(str_t *str) {
9292

9393
int str_cmp(const str_t *lhs, const str_t *rhs) {
9494
const size_t len = ((lhs->len > rhs->len)?lhs:rhs)->len;
95-
return strncmp(lhs->data, rhs->data, len);
95+
return strncmp(lhs->data, rhs->data, len + 1);
9696
}
9797

9898
int refstr_cmp(const refstr_t *lhs, const refstr_t *rhs) {
9999
const size_t len = ((lhs->len > rhs->len)?lhs:rhs)->len;
100-
return strncmp(lhs->data, rhs->data, len);
101-
}
102-
103-
/*
104-
* Compare string prefixes, so that
105-
* "hello" and "hello world" returns 0
106-
* but "hello world" and "hello" returns (int) ' '
107-
*/
108-
static int refstr_cmp_prefix(const refstr_t *prefix, const refstr_t *str) {
109-
return strncmp(prefix->data, str->data, prefix->len);
100+
return strncmp(lhs->data, rhs->data, len + 1);
110101
}
111102

112103
int str_range_cmp(const str_range_t *range, const refstr_t *str) {
113104
/* less than lhs */
114-
const int lhc = refstr_cmp_prefix(&range->lhs, str);
105+
const int lhc = refstr_cmp(&range->lhs, str);
115106
if (lhc > 0) {
116107
return -1;
117108
}
118109

119110
/* greater than rhs */
120-
const int rhc = refstr_cmp_prefix(&range->rhs, str);
111+
const int rhc = refstr_cmp(&range->rhs, str);
121112
if (rhc < 0) {
122113
return +1;
123114
}

test/regression/gufi_distributed.expected

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4,28 +4,28 @@ $ gufi_dir2index_distributed --sbatch "sbatch" --gufi_dir2index "gufi_dir2index"
44
"search2" Already exists!
55
"search2" Already exists!
66
"search2" Already exists!
7-
Splitting 4 directories into 5 chunks of max size 1
8-
Range 0: 1 directory
9-
prefix/directory prefix/directory
10-
Range 1: 1 directory
11-
prefix/empty_directory prefix/empty_directory
12-
Range 2: 1 directory
13-
prefix/leaf_directory prefix/leaf_directory
14-
Range 3: 1 directory
15-
prefix/unusual#? directory , prefix/unusual#? directory ,
7+
Splitting 4 unique basenames into 5 groups of max size 1
8+
Range 0: 1 basename
9+
directory directory
10+
Range 1: 1 basename
11+
empty_directory empty_directory
12+
Range 2: 1 basename
13+
leaf_directory leaf_directory
14+
Range 3: 1 basename
15+
unusual#? directory , unusual#? directory ,
1616
Index upper directories up to and including level 0
1717

1818
# Query Index
1919
$ gufi_query_distributed --sbatch "sbatch" --gufi_query "gufi_query" --threads 2 1 5 "prefix2" -S "SELECT rpath(sname, sroll) FROM vrsummary;" -E "SELECT rpath(sname, sroll) || '/' || name FROM vrpentries;"
20-
Splitting 4 directories into 5 chunks of max size 1
21-
Range 0: 1 directory
22-
prefix2/directory prefix2/directory
23-
Range 1: 1 directory
24-
prefix2/empty_directory prefix2/empty_directory
25-
Range 2: 1 directory
26-
prefix2/leaf_directory prefix2/leaf_directory
27-
Range 3: 1 directory
28-
prefix2/unusual#? directory , prefix2/unusual#? directory ,
20+
Splitting 4 unique basenames into 5 groups of max size 1
21+
Range 0: 1 basename
22+
directory directory
23+
Range 1: 1 basename
24+
empty_directory empty_directory
25+
Range 2: 1 basename
26+
leaf_directory leaf_directory
27+
Range 3: 1 basename
28+
unusual#? directory , unusual#? directory ,
2929
Query upper directories up to and including level 0
3030
cat the following slurm job output files to get complete results:
3131
0
@@ -66,15 +66,15 @@ $ diff <(gufi_query -S "SELECT rpath(sname, sroll) FROM vrsummary;" -E "SELECT r
6666

6767
# Convert source tree to trace files
6868
$ gufi_dir2trace_distributed --sbatch "sbatch" --gufi_dir2trace "gufi_dir2trace" -d "|" 1 5 "prefix" "traces"
69-
Splitting 4 directories into 5 chunks of max size 1
70-
Range 0: 1 directory
71-
prefix/directory prefix/directory
72-
Range 1: 1 directory
73-
prefix/empty_directory prefix/empty_directory
74-
Range 2: 1 directory
75-
prefix/leaf_directory prefix/leaf_directory
76-
Range 3: 1 directory
77-
prefix/unusual#? directory , prefix/unusual#? directory ,
69+
Splitting 4 unique basenames into 5 groups of max size 1
70+
Range 0: 1 basename
71+
directory directory
72+
Range 1: 1 basename
73+
empty_directory empty_directory
74+
Range 2: 1 basename
75+
leaf_directory leaf_directory
76+
Range 3: 1 basename
77+
unusual#? directory , unusual#? directory ,
7878
Index upper directories up to and including level 0
7979
Index can now be created from "traces.*"
8080

test/regression/gufi_query.expected

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -254,8 +254,8 @@ prefix/leaf_directory/leaf_file2
254254
prefix/unusual#? directory ,
255255
prefix/unusual#? directory ,/unusual, name?#
256256

257-
# limit tree traversal to all directories under [a.*, e.*] at level 1 (missing prefix/, prefix/leaf_directory/, and prefix/unusual#? directory ,/
258-
$ gufi_query -d " " -n 2 -y 1 -D a e -S "SELECT rpath(sname, sroll) FROM vrsummary;" -E "SELECT rpath(sname, sroll) || '/' || name FROM vrpentries;" "prefix"
257+
# limit tree traversal to all directories under [a, f] at level 1 (missing prefix/, prefix/leaf_directory/, and prefix/unusual#? directory ,/
258+
$ gufi_query -d " " -n 2 -y 1 -D a f -S "SELECT rpath(sname, sroll) FROM vrsummary;" -E "SELECT rpath(sname, sroll) || '/' || name FROM vrpentries;" "prefix"
259259
prefix/directory
260260
prefix/directory/executable
261261
prefix/directory/readonly

test/regression/gufi_query.sh.in

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,8 @@ run_sort "${GUFI_QUERY} -d \" \" -n ${THREADS} -S \"SELECT rpath(sname, sroll),
127127
echo "# limit tree traversal to directories at level 1 (missing prefix/ and prefix/directory/subdirectory/)"
128128
run_sort "${GUFI_QUERY} -d \" \" -n ${THREADS} -y 1 -z 1 -S \"SELECT rpath(sname, sroll) FROM vrsummary;\" -E \"SELECT rpath(sname, sroll) || '/' || name FROM vrpentries;\" \"${INDEXROOT}\""
129129

130-
echo "# limit tree traversal to all directories under [a.*, e.*] at level 1 (missing prefix/, prefix/leaf_directory/, and prefix/unusual#? directory ,/"
131-
run_sort "${GUFI_QUERY} -d \" \" -n ${THREADS} -y 1 -D a e -S \"SELECT rpath(sname, sroll) FROM vrsummary;\" -E \"SELECT rpath(sname, sroll) || '/' || name FROM vrpentries;\" \"${INDEXROOT}\""
130+
echo "# limit tree traversal to all directories under [a, f] at level 1 (missing prefix/, prefix/leaf_directory/, and prefix/unusual#? directory ,/"
131+
run_sort "${GUFI_QUERY} -d \" \" -n ${THREADS} -y 1 -D a f -S \"SELECT rpath(sname, sroll) FROM vrsummary;\" -E \"SELECT rpath(sname, sroll) || '/' || name FROM vrpentries;\" \"${INDEXROOT}\""
132132

133133
echo "# Output TLV columns (no aggregation)"
134134
run_no_sort "${GUFI_QUERY} -u -n ${THREADS} -E \"SELECT name, size FROM vrpentries WHERE name == '.hidden';\" \"${INDEXROOT}\" | ${HEXLIFY}"

test/unit/googletest/str.cpp

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,35 +109,47 @@ TEST(str, cmp) {
109109
}
110110

111111
TEST(str, range) {
112+
// before lhs
112113
refstr_t a;
113114
a.data = "a";
114115
a.len = 1;
115116

117+
// before lhs
116118
refstr_t ba;
117119
ba.data = "ba";
118120
ba.len = 2;
119121

122+
// lhs == bb
120123
refstr_t bb;
121124
bb.data = "bb";
122125
bb.len = 2;
123126

127+
// lhs < cc < rhs
124128
refstr_t cc;
125129
cc.data = "cc";
126130
cc.len = 2;
127131

132+
// rhs == 'dd'
128133
refstr_t dd;
129134
dd.data = "dd";
130135
dd.len = 2;
131136

137+
// after rhs
138+
refstr_t ddd;
139+
ddd.data = "ddd";
140+
ddd.len = 3;
141+
142+
// after rhs
132143
refstr_t de;
133144
de.data = "de";
134145
de.len = 2;
135146

147+
// after rhs
136148
refstr_t e;
137149
e.data = "e";
138150
e.len = 1;
139151

140-
// [bb.*, dd.*]
152+
// [bb, dd]
141153
str_range_t range;
142154
range.lhs = bb;
143155
range.rhs = dd;
@@ -147,6 +159,7 @@ TEST(str, range) {
147159
EXPECT_EQ(str_range_cmp(&range, &bb), 0);
148160
EXPECT_EQ(str_range_cmp(&range, &cc), 0);
149161
EXPECT_EQ(str_range_cmp(&range, &dd), 0);
162+
EXPECT_EQ(str_range_cmp(&range, &ddd), 1);
150163
EXPECT_EQ(str_range_cmp(&range, &de), 1);
151164
EXPECT_EQ(str_range_cmp(&range, &e), 1);
152165
}

0 commit comments

Comments (0)