Skip to content

Commit c8f71f6

Browse files
committed
change -D lhs rhs to exact match
[lhs.*, rhs.*] -> [lhs.*, rhs]. This removes the duplicate tree walk that occurred when paths had matching prefixes, e.g. a, aa, aaa, aaaa. Previously, using 2 nodes, the groups were [a.*, aa.*] and [aaa.*, aaaa.*], and the work done was: Node 1: a, aa, aaa, aaaa; Node 2: aaa, aaaa. Also updated the gufi_distributed text to better reflect what is happening. This still has the issue of potentially processing a disproportionate number of directories with similar names on the same node, because they fall within the same group. Instead of using -D, an alternative method may be to pass the actual source paths at level L as input arguments; however, there is a limit to shell command size, and there is no guarantee that all paths at level L within one group will fit in one command, meaning the requested node count cannot be honored.
1 parent 92dc313 commit c8f71f6

File tree

7 files changed

+63
-62
lines changed

7 files changed

+63
-62
lines changed

include/str.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,8 +124,8 @@ typedef struct str_range {
124124
} str_range_t;
125125

126126
/*
127-
* [lhs.*, rhs.*]
128-
* prefix match
127+
* [lhs.*, rhs]
128+
* exact match
129129
*/
130130
int str_range_cmp(const str_range_t *range, const refstr_t *str);
131131
/* ****************************************************** */

scripts/gufi_distributed.py

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -97,25 +97,22 @@ def dirs_at_level(root, level):
9797
return sorted([path.decode() for path in dirs.split(b'\x00') if len(path) > 0])
9898

9999
# step 2
100-
# split directories into groups for processing
100+
# split directories into groups of unique basenames for processing
101101
def group_dirs(dirs, splits):
102-
count = len(dirs)
102+
basenames = list(set(os.path.basename(path) for path in dirs))
103+
count = len(basenames)
103104
group_size = count // splits + int(bool(count % splits))
104-
ordered = sorted(dirs, key=os.path.basename)
105-
return group_size, [ordered[i: i + group_size] for i in range(0, count, group_size)]
106-
107-
def dir_plural(count):
108-
return 'directories' if count > 1 else 'directory'
105+
ordered = sorted(basenames)
106+
return count, group_size, [ordered[i: i + group_size] for i in range(0, count, group_size)]
109107

110108
# step 3
111109
# get only the first and last paths in each group
112110
# print debug messages
113111
# run function to schedule jobs if it exists
114-
def schedule_subtrees(dir_count, splits, group_size, groups, schedule_subtree):
115-
print('Splitting {0} {1} into {2} chunks of max size {3}'.format(dir_count,
116-
dir_plural(dir_count),
117-
splits,
118-
group_size))
112+
def schedule_subtrees(unique_basenames, splits, group_size, groups, schedule_subtree):
113+
print('Splitting {0} unique basenames into {1} groups of max size {2}'.format(unique_basenames,
114+
splits,
115+
group_size))
119116

120117
jobids = []
121118
for i, group in enumerate(groups):
@@ -124,7 +121,7 @@ def schedule_subtrees(dir_count, splits, group_size, groups, schedule_subtree):
124121
if count == 0:
125122
break
126123

127-
print(' Range {0}: {1} {2}'.format(i, count, dir_plural(count)))
124+
print(' Range {0}: {1} basename{2}'.format(i, count, 's' if count != 1 else ''))
128125
print(' {0} {1}'.format(group[0], group[-1]))
129126

130127
if schedule_subtree is not None:
@@ -149,8 +146,8 @@ def schedule_top(func, jobids):
149146
# call this combined function to distribute work
150147
def distribute_work(root, level, nodes, schedule_subtree_func, schedule_top_func):
151148
dirs = dirs_at_level(root, level)
152-
group_size, groups = group_dirs(dirs, nodes)
153-
jobids = schedule_subtrees(len(dirs), nodes, group_size, groups, schedule_subtree_func)
149+
unique_basenames, group_size, groups = group_dirs(dirs, nodes)
150+
jobids = schedule_subtrees(unique_basenames, nodes, group_size, groups, schedule_subtree_func)
154151
jobids += [schedule_top(schedule_top_func, jobids).decode()]
155152
return jobids
156153

src/str.c

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -92,32 +92,23 @@ void str_free_existing(str_t *str) {
9292

9393
int str_cmp(const str_t *lhs, const str_t *rhs) {
9494
const size_t len = ((lhs->len > rhs->len)?lhs:rhs)->len;
95-
return strncmp(lhs->data, rhs->data, len);
95+
return strncmp(lhs->data, rhs->data, len + 1);
9696
}
9797

9898
int refstr_cmp(const refstr_t *lhs, const refstr_t *rhs) {
9999
const size_t len = ((lhs->len > rhs->len)?lhs:rhs)->len;
100-
return strncmp(lhs->data, rhs->data, len);
101-
}
102-
103-
/*
104-
* Compare string prefixes, so that
105-
* "hello" and "hello world" returns 0
106-
* but "hello world" and "hello" returns (int) ' '
107-
*/
108-
static int refstr_cmp_prefix(const refstr_t *prefix, const refstr_t *str) {
109-
return strncmp(prefix->data, str->data, prefix->len);
100+
return strncmp(lhs->data, rhs->data, len + 1);
110101
}
111102

112103
int str_range_cmp(const str_range_t *range, const refstr_t *str) {
113104
/* less than lhs */
114-
const int lhc = refstr_cmp_prefix(&range->lhs, str);
105+
const int lhc = refstr_cmp(&range->lhs, str);
115106
if (lhc > 0) {
116107
return -1;
117108
}
118109

119110
/* greater than rhs */
120-
const int rhc = refstr_cmp_prefix(&range->rhs, str);
111+
const int rhc = refstr_cmp(&range->rhs, str);
121112
if (rhc < 0) {
122113
return +1;
123114
}

test/regression/gufi_distributed.expected

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4,28 +4,28 @@ $ gufi_dir2index_distributed --sbatch "sbatch" --gufi_dir2index "gufi_dir2index"
44
"search2" Already exists!
55
"search2" Already exists!
66
"search2" Already exists!
7-
Splitting 4 directories into 5 chunks of max size 1
8-
Range 0: 1 directory
9-
prefix/directory prefix/directory
10-
Range 1: 1 directory
11-
prefix/empty_directory prefix/empty_directory
12-
Range 2: 1 directory
13-
prefix/leaf_directory prefix/leaf_directory
14-
Range 3: 1 directory
15-
prefix/unusual#? directory , prefix/unusual#? directory ,
7+
Splitting 4 unique basenames into 5 groups of max size 1
8+
Range 0: 1 basename
9+
directory directory
10+
Range 1: 1 basename
11+
empty_directory empty_directory
12+
Range 2: 1 basename
13+
leaf_directory leaf_directory
14+
Range 3: 1 basename
15+
unusual#? directory , unusual#? directory ,
1616
Index upper directories up to and including level 0
1717

1818
# Query Index
1919
$ gufi_query_distributed --sbatch "sbatch" --gufi_query "gufi_query" --threads 2 1 5 "prefix2" -S "SELECT rpath(sname, sroll) FROM vrsummary;" -E "SELECT rpath(sname, sroll) || '/' || name FROM vrpentries;"
20-
Splitting 4 directories into 5 chunks of max size 1
21-
Range 0: 1 directory
22-
prefix2/directory prefix2/directory
23-
Range 1: 1 directory
24-
prefix2/empty_directory prefix2/empty_directory
25-
Range 2: 1 directory
26-
prefix2/leaf_directory prefix2/leaf_directory
27-
Range 3: 1 directory
28-
prefix2/unusual#? directory , prefix2/unusual#? directory ,
20+
Splitting 4 unique basenames into 5 groups of max size 1
21+
Range 0: 1 basename
22+
directory directory
23+
Range 1: 1 basename
24+
empty_directory empty_directory
25+
Range 2: 1 basename
26+
leaf_directory leaf_directory
27+
Range 3: 1 basename
28+
unusual#? directory , unusual#? directory ,
2929
Query upper directories up to and including level 0
3030
cat the following slurm job output files to get complete results:
3131
0
@@ -66,15 +66,15 @@ $ diff <(gufi_query -S "SELECT rpath(sname, sroll) FROM vrsummary;" -E "SELECT r
6666

6767
# Convert source tree to trace files
6868
$ gufi_dir2trace_distributed --sbatch "sbatch" --gufi_dir2trace "gufi_dir2trace" -d "|" 1 5 "prefix" "traces"
69-
Splitting 4 directories into 5 chunks of max size 1
70-
Range 0: 1 directory
71-
prefix/directory prefix/directory
72-
Range 1: 1 directory
73-
prefix/empty_directory prefix/empty_directory
74-
Range 2: 1 directory
75-
prefix/leaf_directory prefix/leaf_directory
76-
Range 3: 1 directory
77-
prefix/unusual#? directory , prefix/unusual#? directory ,
69+
Splitting 4 unique basenames into 5 groups of max size 1
70+
Range 0: 1 basename
71+
directory directory
72+
Range 1: 1 basename
73+
empty_directory empty_directory
74+
Range 2: 1 basename
75+
leaf_directory leaf_directory
76+
Range 3: 1 basename
77+
unusual#? directory , unusual#? directory ,
7878
Index upper directories up to and including level 0
7979
Index can now be created from "traces.*"
8080

test/regression/gufi_query.expected

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -254,8 +254,8 @@ prefix/leaf_directory/leaf_file2
254254
prefix/unusual#? directory ,
255255
prefix/unusual#? directory ,/unusual, name?#
256256

257-
# limit tree traversal to all directories under [a.*, e.*] at level 1 (missing prefix/, prefix/leaf_directory/, and prefix/unusual#? directory ,/
258-
$ gufi_query -d " " -n 2 -y 1 -D a e -S "SELECT rpath(sname, sroll) FROM vrsummary;" -E "SELECT rpath(sname, sroll) || '/' || name FROM vrpentries;" "prefix"
257+
# limit tree traversal to all directories under [a, f] at level 1 (missing prefix/, prefix/leaf_directory/, and prefix/unusual#? directory ,/
258+
$ gufi_query -d " " -n 2 -y 1 -D a f -S "SELECT rpath(sname, sroll) FROM vrsummary;" -E "SELECT rpath(sname, sroll) || '/' || name FROM vrpentries;" "prefix"
259259
prefix/directory
260260
prefix/directory/executable
261261
prefix/directory/readonly

test/regression/gufi_query.sh.in

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,8 @@ run_sort "${GUFI_QUERY} -d \" \" -n ${THREADS} -S \"SELECT rpath(sname, sroll),
127127
echo "# limit tree traversal to directories at level 1 (missing prefix/ and prefix/directory/subdirectory/)"
128128
run_sort "${GUFI_QUERY} -d \" \" -n ${THREADS} -y 1 -z 1 -S \"SELECT rpath(sname, sroll) FROM vrsummary;\" -E \"SELECT rpath(sname, sroll) || '/' || name FROM vrpentries;\" \"${INDEXROOT}\""
129129

130-
echo "# limit tree traversal to all directories under [a.*, e.*] at level 1 (missing prefix/, prefix/leaf_directory/, and prefix/unusual#? directory ,/"
131-
run_sort "${GUFI_QUERY} -d \" \" -n ${THREADS} -y 1 -D a e -S \"SELECT rpath(sname, sroll) FROM vrsummary;\" -E \"SELECT rpath(sname, sroll) || '/' || name FROM vrpentries;\" \"${INDEXROOT}\""
130+
echo "# limit tree traversal to all directories under [a, f] at level 1 (missing prefix/, prefix/leaf_directory/, and prefix/unusual#? directory ,/"
131+
run_sort "${GUFI_QUERY} -d \" \" -n ${THREADS} -y 1 -D a f -S \"SELECT rpath(sname, sroll) FROM vrsummary;\" -E \"SELECT rpath(sname, sroll) || '/' || name FROM vrpentries;\" \"${INDEXROOT}\""
132132

133133
echo "# Output TLV columns (no aggregation)"
134134
run_no_sort "${GUFI_QUERY} -u -n ${THREADS} -E \"SELECT name, size FROM vrpentries WHERE name == '.hidden';\" \"${INDEXROOT}\" | ${HEXLIFY}"

test/unit/googletest/str.cpp

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,35 +109,47 @@ TEST(str, cmp) {
109109
}
110110

111111
TEST(str, range) {
112+
// before lhs
112113
refstr_t a;
113114
a.data = "a";
114115
a.len = 1;
115116

117+
// before lhs
116118
refstr_t ba;
117119
ba.data = "ba";
118120
ba.len = 2;
119121

122+
// lhs == bb
120123
refstr_t bb;
121124
bb.data = "bb";
122125
bb.len = 2;
123126

127+
// lhs < cc < rhs
124128
refstr_t cc;
125129
cc.data = "cc";
126130
cc.len = 2;
127131

132+
// rhs == 'dd'
128133
refstr_t dd;
129134
dd.data = "dd";
130135
dd.len = 2;
131136

137+
// after rhs
138+
refstr_t ddd;
139+
ddd.data = "ddd";
140+
ddd.len = 3;
141+
142+
// after rhs
132143
refstr_t de;
133144
de.data = "de";
134145
de.len = 2;
135146

147+
// after rhs
136148
refstr_t e;
137149
e.data = "e";
138150
e.len = 1;
139151

140-
// [bb.*, dd.*]
152+
// [bb, dd]
141153
str_range_t range;
142154
range.lhs = bb;
143155
range.rhs = dd;
@@ -147,6 +159,7 @@ TEST(str, range) {
147159
EXPECT_EQ(str_range_cmp(&range, &bb), 0);
148160
EXPECT_EQ(str_range_cmp(&range, &cc), 0);
149161
EXPECT_EQ(str_range_cmp(&range, &dd), 0);
162+
EXPECT_EQ(str_range_cmp(&range, &ddd), 1);
150163
EXPECT_EQ(str_range_cmp(&range, &de), 1);
151164
EXPECT_EQ(str_range_cmp(&range, &e), 1);
152165
}

0 commit comments

Comments
 (0)