Skip to content

Commit 312567d

Browse files
authored
Merge pull request #299 from intel/develop
Develop
2 parents a3a2105 + 4d71d70 commit 312567d

File tree

10 files changed

+107
-67
lines changed

10 files changed

+107
-67
lines changed

src/common/common_utils.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ namespace yask {
4747
// https://semver.org/.
4848

4949
// Format: "major.minor.patch[-alpha|-beta]".
50-
const string version = "4.05.03";
50+
const string version = "4.05.04";
5151

5252
string yask_get_version_string() {
5353
return version;

src/kernel/Makefile

+35-28
Original file line numberDiff line numberDiff line change
@@ -372,7 +372,7 @@ ifeq ($(cxx_is_llvm_intel),1)
372372
-fimf-precision=low -fp-model fast -fimf-domain-exclusion=none -fma
373373
YK_CXXWARN2 += -Wno-unknown-pragmas -Wno-unused-variable -Wno-unused-but-set-variable \
374374
-Wno-unused-const-variable -fno-color-diagnostics
375-
OMPFLAG := -fiopenmp
375+
OMPFLAG := -qopenmp
376376
SWIG_CXXFLAGS += -Wno-deprecated-declarations
377377
MACROS += INTEL_OMP
378378
VEC_MACROS += NO_PRAGMA_VEC2
@@ -881,7 +881,6 @@ help:
881881
echo " $(MAKE) clean; $(MAKE) -j stencil=ssg YK_CXXOPT='-O2' # Use O2 optimization"; \
882882
echo " $(MAKE) clean; $(MAKE) -j stencil=ssg YK_CXX=icpc # Use classic Intel compiler"; \
883883
echo " $(MAKE) clean; $(MAKE) -j stencil=ssg YK_CXX=g++ # Use gnu compiler"; \
884-
echo " $(MAKE) clean; $(MAKE) -j stencil=ssg MPI_CXX=mpiCC # Specify MPI compiler"; \
885884
echo " "
886885
@echo "Example builds of kernel API for C++ and Python apps:"; \
887886
echo " $(MAKE) clean; $(MAKE) -j stencil=iso3dfd yk-api"; \
@@ -910,21 +909,29 @@ help:
910909
first_test := 0
911910
last_test := 999
912911

912+
# Default regex for stencil to run.
913+
test_regex := .
914+
913915
TEST_MAKE_ARGS := real_bytes=8 use_rcp=0 allow_new_var_types=0 trace=1
914916
TEST_MAKE := $(MAKE) $(TEST_MAKE_ARGS)
915917

916-
# Define makefile functions for folding.
917918
# Set default threads.
918-
# Disable folding and checking for offload testing.
919+
# Enable checking for CPU testing.
919920
ifeq ($(offload),1)
920-
FOLD =
921921
outer_threads := 2
922922
inner_threads := 2
923923
else
924-
TEST_MAKE_ARGS += check=1
925-
FOLD = fold=$(subst $(space),$(comma),$(1))
926924
outer_threads := 8
927925
inner_threads := 2
926+
TEST_MAKE_ARGS += check=1
927+
endif
928+
929+
# Define makefile functions for folding.
930+
# Disable folding for non-vectorized arch.
931+
ifeq ($(arch),intel64)
932+
FOLD =
933+
else
934+
FOLD = fold=$(subst $(space),$(comma),$(1))
928935
endif
929936

930937
### Unit tests.
@@ -1041,7 +1048,7 @@ test_args10 := $(DEF_MPI_TEST_ARGS) -l 64 -b 24 -mb 16 -bt 2 -no-use_shm -overla
10411048
test_args11 := $(DEF_MPI_TEST_ARGS) -l 64 -b 24 -mb 16 -bt 2 -use_shm -no-overlap_comms $(EXTRA_TEST_ARGS)
10421049
endif
10431050

1044-
# Run the kernel binary using several combos of sizes and ranks.
1051+
# Run the kernel binary using the test args defined above.
10451052
yk-tests:
10461053
if (( $(first_test) <= 0 && $(last_test) >= 0 )); then $(YK_SCRIPT) $(test_args0); fi
10471054
if (( $(first_test) <= 1 && $(last_test) >= 1 )); then $(YK_SCRIPT) $(test_args1); fi
@@ -1057,15 +1064,19 @@ yk-mpi-tests:
10571064
# Run the default YASK compiler and kernel.
10581065
# First run on 1 rank, then multiple ranks if ranks>1.
10591066
# This is the primary target for building and running stencil tests.
1060-
yc-and-yk-test: $(YK_EXEC) $(YK_SCRIPT)
1061-
$(MAKE) ranks=1 yk-tests
1062-
if (( $(ranks) > 1 )); then $(MAKE) yk-tests yk-mpi-tests; fi
1067+
yc-and-yk-test: $(YK_SCRIPT)
1068+
@ echo "Running tests that match regex '$(test_regex)' numbered from $(first_test) to $(last_test)..."
1069+
if [[ $(stencil) =~ $(test_regex) ]]; then \
1070+
$(MAKE) $(YK_EXEC) && \
1071+
$(MAKE) ranks=1 yk-tests && \
1072+
if (( $(ranks) > 1 )); then $(MAKE) yk-tests yk-mpi-tests; fi; \
1073+
fi
10631074
STENCIL_TEST := $(TEST_MAKE) yc-and-yk-test
10641075

10651076
# Run the YASK kernel test without implicitly using the YASK compiler.
10661077
yk-test-no-yc: kernel-no-yc $(YK_SCRIPT)
10671078
$(MAKE) ranks=1 yk-tests
1068-
if (( $(ranks) > 1 )); then $(MAKE) yk-tests yk-mpi-tests; fi
1079+
# Use an if-statement (not '&&') so this recipe line exits 0 when ranks <= 1.
if (( $(ranks) > 1 )); then $(MAKE) yk-tests yk-mpi-tests; fi
10691080

10701081
# Run the kernel API tests for C++ and Python with and w/o expected exceptions.
10711082
api-tests:
@@ -1159,20 +1170,6 @@ single-stencil-tests:
11591170
4d-tests:
11601171
$(MAKE) clean; $(STENCIL_TEST) stencil=test_4d $(call FOLD,w=2 x=2)
11611172

1162-
# Selected collections from above for testing specific features.
1163-
scratch-tests:
1164-
$(MAKE) clean; $(STENCIL_TEST) stencil=test_scratch_1d $(call FOLD,x=4)
1165-
$(MAKE) clean; $(STENCIL_TEST) stencil=test_scratch_stages_1d $(call FOLD,x=4)
1166-
$(MAKE) clean; $(STENCIL_TEST) stencil=test_scratch_boundary_1d $(call FOLD,x=4)
1167-
$(MAKE) clean; $(STENCIL_TEST) stencil=test_scratch_2d $(call FOLD,x=2 y=2)
1168-
$(MAKE) clean; $(STENCIL_TEST) stencil=test_scratch_3d $(call FOLD,x=2 z=2) inner_loop_dim=x
1169-
1170-
boundary-tests:
1171-
$(MAKE) clean; $(STENCIL_TEST) stencil=test_boundary_1d $(call FOLD,x=4)
1172-
$(MAKE) clean; $(STENCIL_TEST) stencil=test_scratch_boundary_1d $(call FOLD,x=4)
1173-
$(MAKE) clean; $(STENCIL_TEST) stencil=test_boundary_2d $(call FOLD,x=2 y=2)
1174-
$(MAKE) clean; $(STENCIL_TEST) stencil=test_boundary_3d $(call FOLD,x=2 y=2) inner_loop_dim=1
1175-
11761173
# The standard set of stencils to test.
11771174
stencil-tests:
11781175
$(MAKE) 1d-tests
@@ -1184,6 +1181,16 @@ stencil-tests:
11841181
if (( $(offload) == 0 )); then $(MAKE) 3d-tests4; fi
11851182
$(MAKE) 4d-tests
11861183

1184+
# Pre-defined feature tests.
1185+
scratch-tests:
1186+
$(MAKE) stencil-tests test_regex=scratch
1187+
1188+
boundary-tests:
1189+
$(MAKE) stencil-tests test_regex=boundary
1190+
1191+
stages-tests:
1192+
$(MAKE) stencil-tests test_regex=stages
1193+
11871194
unit-tests:
11881195
$(MAKE) clean; $(MAKE) cxx-yk-omp-test
11891196
$(MAKE) clean; $(MAKE) cxx-yk-var-test stencil=test_3d $(call FOLD,x=2 y=2)
@@ -1193,9 +1200,9 @@ all-tests:
11931200
$(MAKE) api-tests
11941201
$(MAKE) stencil-tests
11951202

1196-
# Install the script.
1203+
# Install the scripts.
11971204
# Then, build and run all the tests.
11981205
all:
1199-
$(MAKE) script
1206+
$(MAKE) scripts
12001207
$(MAKE) all-tests
12011208

src/kernel/lib/settings.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,8 @@ namespace yask {
365365
parser.add_option(make_shared<command_line_parser::bool_option>
366366
("allow_addl_padding",
367367
"[Advanced] Allow automatic extension of padding"
368-
" beyond minimal vector alignment on any or all YASK vars.",
368+
" beyond minimal vector alignment on any or all YASK vars"
369+
" based on internal heuristics.",
369370
_allow_addl_pad));
370371
#ifdef USE_MPI
371372
_add_domain_option(parser, "nr", "Num ranks", _num_ranks);

src/kernel/lib/settings.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -264,7 +264,7 @@ namespace yask {
264264

265265
// Var behavior, including allocation.
266266
bool _step_wrap = false; // Allow invalid step indices to alias to valid ones (set via APIs only).
267-
bool _allow_addl_pad = true; // Allow extending padding beyond what's needed for alignment.
267+
bool _allow_addl_pad = false;
268268
#ifdef USE_OFFLOAD
269269
bool _bundle_allocs = false;
270270
#else

src/kernel/lib/setup.cpp

+5
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,11 @@ namespace yask {
120120
#ifdef USE_OFFLOAD
121121
_omp_hostn = omp_get_initial_device();
122122
_omp_devn = omp_get_default_device();
123+
124+
// Heuristic to assign GPU n to rank n on this node.
125+
// Assumes shm is local to a node.
126+
if (my_rank > 0 && omp_get_num_devices() > my_shm_rank)
127+
_omp_devn = my_shm_rank;
123128
#endif
124129

125130
#else

src/kernel/lib/soln_apis.cpp

+7-1
Original file line numberDiff line numberDiff line change
@@ -151,8 +151,14 @@ namespace yask {
151151
reset_auto_tuner(actl_opts->_do_auto_tune, false);
152152

153153
// Report ranks.
154+
#ifdef USE_MPI
154155
DEBUG_MSG("\nNum MPI ranks: " << env->get_num_ranks() <<
155-
"\nThis MPI rank index: " << env->get_rank_index());
156+
"\nThis MPI rank index: " << env->get_rank_index() <<
157+
"\nNum shm-group MPI ranks: " << env->num_shm_ranks <<
158+
"\nThis shm-group MPI rank: " << env->my_shm_rank);
159+
#else
160+
DEBUG_MSG("\nMPI not supported in this binary");
161+
#endif
156162

157163
// report threads.
158164
{

src/kernel/lib/stencil_calc.hpp

+6-5
Original file line numberDiff line numberDiff line change
@@ -690,7 +690,7 @@ namespace yask {
690690
// Full rectilinear polytope of aligned vecs.
691691
else {
692692
TRACE_MSG("calculating vecs within "
693-
"normalized local indices " <<
693+
"*normalized* local indices " <<
694694
norm_fvidxs.make_range_str(true) <<
695695
" via outer thread " << outer_thread_idx <<
696696
" and inner thread " << inner_thread_idx);
@@ -712,15 +712,16 @@ namespace yask {
712712
sb_fvidxs.make_range_str(true) <<
713713
" via outer thread " << outer_thread_idx <<
714714
" and inner thread " << inner_thread_idx);
715-
#if VPTS == 1
715+
#if VLEN == 1
716716
THROW_YASK_EXCEPTION("(internal fault) vector border-code not expected with vec-size==1");
717717
#else
718718

719719
// Normalized vector indices.
720720
auto norm_ovidxs = normalize_indices(sb_ovidxs);
721721

722-
// Need to find range in each border part.
723-
// 2D example w/4 edges and 4 corners:
722+
// Need to find range in each border part. 2D example w/4
723+
// edges and 4 corners:
724+
//
724725
// +---+------+---+
725726
// | lx| |rx |
726727
// | ly| ly |ly |
@@ -839,7 +840,7 @@ namespace yask {
839840
if (pv_needed) {
840841
TRACE_MSG("calculating partial vectors with mask 0x" <<
841842
std::hex << pv_mask << std::dec << " for " << descr <<
842-
" within normalized local indices " <<
843+
" within *normalized* local indices " <<
843844
pv_part.make_range_str(true) <<
844845
" via outer thread " << outer_thread_idx <<
845846
" and inner thread " << inner_thread_idx);

src/kernel/lib/yk_var.cpp

+28-19
Original file line numberDiff line numberDiff line change
@@ -255,8 +255,10 @@ namespace yask {
255255
// Adjust padding only for domain dims.
256256
if (_domain_dim_mask & mbit) {
257257

258-
// Rounding should use soln vec lengths in case
259-
// this var is not vectorized.
258+
// Use soln vec len for rounding to allow reading a non-vec
259+
// var in this dim while calculating a vec var. (The var
260+
// vec-len is always 1 or the same as the soln vec-len in a
261+
// given dim.)
260262
auto svl = _corep->_soln_vec_lens[i];
261263

262264
// Add more padding requested by options or APIs.
@@ -265,31 +267,38 @@ namespace yask {
265267
new_left_pads[i] = max(new_left_pads[i], _corep->_req_left_pads[i]);
266268
new_right_pads[i] = max(new_right_pads[i], _corep->_req_right_pads[i]);
267269

268-
// Round left pad up to vec len.
270+
// Round left pad up to soln vec len.
269271
new_left_pads[i] = ROUND_UP(new_left_pads[i], svl);
270272

271-
// Round domain + right pad up to soln vec len by extending right pad.
272-
// Using soln vec len to allow reading a non-vec var in this dim
273-
// while calculating a vec var. (The var vec-len is always 1 or the same
274-
// as the soln vec-len in a given dim.)
275-
idx_t dprp = ROUND_UP(_corep->_domains[i] + new_right_pads[i], svl);
276-
277-
// Calculate pads from overall domain + right pad.
278-
new_right_pads[i] = dprp - _corep->_domains[i];
279-
280-
// Add yet another vec to both sides. This allows full-vector reads;
281-
// only writes are masked.
273+
// Sum of domain and right pad, rounded up to soln vec len.
274+
idx_t rdpp = ROUND_UP(_corep->_domains[i] + new_right_pads[i], svl);
275+
276+
// Subtract domain size back out to get desired right pad.
277+
new_right_pads[i] = rdpp - _corep->_domains[i];
278+
279+
// When vec len > 1, add extra vecs to accommodate
280+
// mis-alignment and extra calculations
281+
//
282+
// Example:
283+
// ... +-------+-+ Last full vec and partial vec domain,
284+
// ... +-------+-+---+ so minimal halo is within 1-vec pad.
285+
// ... +-------+-------+ But full vecs actually calc'd,
286+
// ... +-------+-------+---+ so halo reads are needed beyond that.
287+
// ... +-------+-------+---+---+ Rounded up for alloc.
288+
#if VLEN > 1
282289
new_left_pads[i] += svl;
283290
new_right_pads[i] += svl;
291+
#endif
284292

285-
// Make inner dim an odd number of vecs.
293+
// Make inner dim an odd number of vecs when allowed.
286294
// This reportedly helps avoid some uarch aliasing.
287-
auto na = new_left_pads[i] + _corep->_domains[i] + new_right_pads[i];
295+
// Only add this optional vector if not already allocated.
288296
if (!p &&
289297
actl_opts->_allow_addl_pad &&
290-
get_dim_name(i) == inner_layout_dim &&
291-
(na / svl) % 2 == 0) {
292-
new_right_pads[i] += svl;
298+
get_dim_name(i) == inner_layout_dim) {
299+
auto na = new_left_pads[i] + _corep->_domains[i] + new_right_pads[i];
300+
if ((na / svl) % 2 == 0)
301+
new_right_pads[i] += svl;
293302
}
294303

295304
// If storage is allocated, get max of existing pad & new

src/kernel/yask.sh

+15-9
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ while true; do
172172
echo " Run YASK executable as an argument to <command>, e.g., 'numactl -N 0'."
173173
echo " -mpi_cmd <command>"
174174
echo " Run YASK executable as an argument to <command>, e.g., 'mpiexec.hydra -n 4'."
175-
echo " If -mpi_cmd and -exe_prefix are both specified, this one is used first."
175+
echo " If -mpi_cmd and -exe_prefix are both specified, this one is applied first."
176176
echo " The default command is based on the number of nodes and ranks (see below)."
177177
echo " -force_mpi"
178178
echo " Generate a default 'mpirun' prefix even if there is only 1 rank to run."
@@ -186,7 +186,6 @@ while true; do
186186
echo " This value, along with the number of nodes, <N>, is used to set these defaults:"
187187
echo " - Number of MPI ranks per node to <R>/<N>."
188188
echo " - Number of OpenMP threads per rank based on core count (for CPU kernels only)."
189-
echo " - Default MPI command to 'mpirun -np <R> -ppn <R>/<N>'."
190189
echo " If a different MPI command is needed, use -mpi_cmd <command> explicitly."
191190
echo " If the env var SLURM_NTASKS is set AND if it greater than the number of nodes,"
192191
echo " the default is its value."
@@ -389,15 +388,22 @@ fi
389388
# Set MPI command default.
390389
ppn=$(( $nranks / $nnodes ))
391390
if [[ $nranks > 1 || $force_mpi == 1 ]]; then
392-
: ${mpi_cmd="mpirun -np $nranks -ppn $ppn"}
393391

394-
# Add default Intel MPI settings.
395-
envs+=" I_MPI_PRINT_VERSION=1 I_MPI_DEBUG=5"
392+
if [[ $arch_offload =~ "nv" ]]; then
393+
: ${mpi_cmd="mpirun -np $nranks --oversubscribe"}
396394

397-
# Add NUMA pinning if number of discovered NUMA nodes
398-
# equals what is being used.
399-
if [[ -n "$nnumas" && $nnumas == $ppn ]]; then
400-
envs+=" I_MPI_PIN_DOMAIN=numa"
395+
else
396+
: ${mpi_cmd="mpirun -np $nranks -ppn $ppn"}
397+
398+
# Add default Intel MPI settings.
399+
# These will be ignored if Intel MPI isn't used.
400+
envs+=" I_MPI_PRINT_VERSION=1 I_MPI_DEBUG=5"
401+
402+
# Add NUMA pinning if number of discovered NUMA nodes
403+
# equals what is being used.
404+
if [[ -n "$nnumas" && $nnumas == $ppn ]]; then
405+
envs+=" I_MPI_PIN_DOMAIN=numa"
406+
fi
401407
fi
402408

403409
# Check whether HBM policy setting is allowed.

utils/bin/yask_log_to_csv.pl

+7-2
Original file line numberDiff line numberDiff line change
@@ -45,15 +45,20 @@
4545

4646
# Header.
4747
YaskUtils::printCsvHeader($outFH);
48-
print $outFH ",log file\n";
48+
print $outFH ",date & time,log file\n";
4949

5050
# Values from files.
5151
for my $arg (@ARGV) {
5252
for my $fn (glob $arg) {
5353
my %results;
5454
YaskUtils::getResultsFromFile(\%results, $fn);
5555

56+
my $datestr = "";
57+
if ($fn =~ /(\d{4})-(\d{2})-(\d{2})_(\d{2})-(\d{2})-(\d{2})/) {
58+
$datestr = "$2/$3/$1 $4:$5:$6"; # format for Excel.
59+
}
60+
5661
YaskUtils::printCsvValues(\%results, $outFH);
57-
print $outFH ",\"$fn\"\n";
62+
print $outFH ",\"$datestr\",\"$fn\"\n";
5863
}
5964
}

0 commit comments

Comments
 (0)