Merge pull request #299 from intel/develop

chuckyount · web-flow · commit 312567d4f4ad · 2024-07-18T11:37:01.000-07:00
Develop
diff --git a/src/common/common_utils.cpp b/src/common/common_utils.cpp
@@ -47,7 +47,7 @@ namespace yask {
     // https://semver.org/.
 
     // Format: "major.minor.patch[-alpha|-beta]".
-    const string version = "4.05.03";
+    const string version = "4.05.04";
 
     string yask_get_version_string() {
         return version;
diff --git a/src/kernel/Makefile b/src/kernel/Makefile
@@ -372,7 +372,7 @@ ifeq ($(cxx_is_llvm_intel),1)
 			-fimf-precision=low -fp-model fast -fimf-domain-exclusion=none -fma
  YK_CXXWARN2	+=	-Wno-unknown-pragmas -Wno-unused-variable -Wno-unused-but-set-variable \
 			-Wno-unused-const-variable -fno-color-diagnostics
- OMPFLAG	:=	-fiopenmp
+ OMPFLAG	:=	-qopenmp
  SWIG_CXXFLAGS	+=	-Wno-deprecated-declarations
  MACROS		+=	INTEL_OMP
  VEC_MACROS	+=	NO_PRAGMA_VEC2
@@ -881,7 +881,6 @@ help:
 	echo " $(MAKE) clean; $(MAKE) -j stencil=ssg YK_CXXOPT='-O2' # Use O2 optimization"; \
 	echo " $(MAKE) clean; $(MAKE) -j stencil=ssg YK_CXX=icpc   # Use classic Intel compiler"; \
 	echo " $(MAKE) clean; $(MAKE) -j stencil=ssg YK_CXX=g++    # Use gnu compiler"; \
-	echo " $(MAKE) clean; $(MAKE) -j stencil=ssg MPI_CXX=mpiCC # Specify MPI compiler"; \
 	echo " "
 	@echo "Example builds of kernel API for C++ and Python apps:"; \
 	echo " $(MAKE) clean; $(MAKE) -j stencil=iso3dfd yk-api"; \
@@ -910,21 +909,29 @@ help:
 first_test	:=	0
 last_test	:=	999
 
+# Default regex for selecting stencils to test; '.' matches every stencil name.
+test_regex	:=	.
+
 TEST_MAKE_ARGS	:=	real_bytes=8 use_rcp=0 allow_new_var_types=0 trace=1
 TEST_MAKE 	:=	$(MAKE) $(TEST_MAKE_ARGS)
 
-# Define makefile functions for folding.
 # Set default threads.
-# Disable folding and checking for offload testing.
+# Enable checking for CPU testing.
 ifeq ($(offload),1)
-FOLD		=
 outer_threads	:=	2
 inner_threads	:=	2
 else
-TEST_MAKE_ARGS	+=	check=1
-FOLD		=	fold=$(subst $(space),$(comma),$(1))
 outer_threads	:=	8
 inner_threads	:=	2
+TEST_MAKE_ARGS	+=	check=1
+endif
+
+# Define makefile functions for folding.
+# Disable folding for non-vectorized arch.
+ifeq ($(arch),intel64)
+FOLD		=
+else
+FOLD		=	fold=$(subst $(space),$(comma),$(1))
 endif
 
 ### Unit tests.
@@ -1041,7 +1048,7 @@ test_args10	:=	$(DEF_MPI_TEST_ARGS) -l 64 -b 24 -mb 16 -bt 2 -no-use_shm -overla
 test_args11	:=	$(DEF_MPI_TEST_ARGS) -l 64 -b 24 -mb 16 -bt 2 -use_shm -no-overlap_comms $(EXTRA_TEST_ARGS)
 endif
 
-# Run the kernel binary using several combos of sizes and ranks.
+# Run the kernel binary using the test args defined above.
 yk-tests:
 	if (( $(first_test) <= 0 && $(last_test) >= 0 )); then $(YK_SCRIPT) $(test_args0); fi
 	if (( $(first_test) <= 1 && $(last_test) >= 1 )); then $(YK_SCRIPT) $(test_args1); fi
@@ -1057,15 +1064,19 @@ yk-mpi-tests:
 # Run the default YASK compiler and kernel.
 # First run on 1 rank, then multiple ranks if ranks>1.
 # This is the primary target for building and running stencil tests.
-yc-and-yk-test: $(YK_EXEC) $(YK_SCRIPT)
-	$(MAKE) ranks=1 yk-tests
-	if (( $(ranks) > 1 )); then $(MAKE) yk-tests yk-mpi-tests; fi
+yc-and-yk-test: $(YK_SCRIPT)
+	@ echo "Running tests that match regex '$(test_regex)' numbered from $(first_test) to $(last_test)..."
+	if [[ $(stencil) =~ $(test_regex) ]]; then \
+	  $(MAKE) $(YK_EXEC) && \
+	  $(MAKE) ranks=1 yk-tests && \
+	  if (( $(ranks) > 1 )); then $(MAKE) yk-tests yk-mpi-tests; fi; \
+	fi
 STENCIL_TEST	:=	$(TEST_MAKE) yc-and-yk-test
 
 # Run the YASK kernel test without implicity using the YASK compiler.
 yk-test-no-yc: kernel-no-yc $(YK_SCRIPT)
 	$(MAKE) ranks=1 yk-tests
 	if (( $(ranks) > 1 )); then $(MAKE) yk-tests yk-mpi-tests; fi
 
 # Run the kernel API tests for C++ and Python with and w/o expected exceptions.
 api-tests:
@@ -1159,20 +1170,6 @@ single-stencil-tests:
 4d-tests:
 	$(MAKE) clean; $(STENCIL_TEST) stencil=test_4d $(call FOLD,w=2 x=2)
 
-# Selected collections from above for testing specific features.
-scratch-tests:
-	$(MAKE) clean; $(STENCIL_TEST) stencil=test_scratch_1d $(call FOLD,x=4)
-	$(MAKE) clean; $(STENCIL_TEST) stencil=test_scratch_stages_1d $(call FOLD,x=4)
-	$(MAKE) clean; $(STENCIL_TEST) stencil=test_scratch_boundary_1d $(call FOLD,x=4)
-	$(MAKE) clean; $(STENCIL_TEST) stencil=test_scratch_2d $(call FOLD,x=2 y=2)
-	$(MAKE) clean; $(STENCIL_TEST) stencil=test_scratch_3d $(call FOLD,x=2 z=2) inner_loop_dim=x
-
-boundary-tests:
-	$(MAKE) clean; $(STENCIL_TEST) stencil=test_boundary_1d $(call FOLD,x=4)
-	$(MAKE) clean; $(STENCIL_TEST) stencil=test_scratch_boundary_1d $(call FOLD,x=4)
-	$(MAKE) clean; $(STENCIL_TEST) stencil=test_boundary_2d $(call FOLD,x=2 y=2)
-	$(MAKE) clean; $(STENCIL_TEST) stencil=test_boundary_3d $(call FOLD,x=2 y=2) inner_loop_dim=1
-
 # The standard set of stencils to test.
 stencil-tests:
 	$(MAKE) 1d-tests
@@ -1184,6 +1181,16 @@ stencil-tests:
 	if (( $(offload) == 0 )); then $(MAKE) 3d-tests4; fi
 	$(MAKE) 4d-tests
 
+# Pre-defined feature tests.
+scratch-tests:
+	$(MAKE) stencil-tests test_regex=scratch
+
+boundary-tests:
+	$(MAKE) stencil-tests test_regex=boundary
+
+stages-tests:
+	$(MAKE) stencil-tests test_regex=stages
+
 unit-tests:
 	$(MAKE) clean; $(MAKE) cxx-yk-omp-test
 	$(MAKE) clean; $(MAKE) cxx-yk-var-test stencil=test_3d $(call FOLD,x=2 y=2)
@@ -1193,9 +1200,9 @@ all-tests:
 	$(MAKE) api-tests
 	$(MAKE) stencil-tests
 
-# Install the script.
+# Install the scripts.
 # Then, build and run all the tests.
 all:
-	$(MAKE) script
+	$(MAKE) scripts
 	$(MAKE) all-tests
 
diff --git a/src/kernel/lib/settings.cpp b/src/kernel/lib/settings.cpp
@@ -365,7 +365,8 @@ namespace yask {
         parser.add_option(make_shared<command_line_parser::bool_option>
                           ("allow_addl_padding",
                            "[Advanced] Allow automatic extension of padding"
-                           " beyond minimal vector alignment on any or all YASK vars.",
+                           " beyond minimal vector alignment on any or all YASK vars"
+                           " based on internal heuristics.",
                            _allow_addl_pad));
         #ifdef USE_MPI
         _add_domain_option(parser, "nr", "Num ranks", _num_ranks);
diff --git a/src/kernel/lib/settings.hpp b/src/kernel/lib/settings.hpp
@@ -264,7 +264,7 @@ namespace yask {
 
         // Var behavior, including allocation.
         bool _step_wrap = false; // Allow invalid step indices to alias to valid ones (set via APIs only).
-        bool _allow_addl_pad = true; // Allow extending padding beyond what's needed for alignment.
+        bool _allow_addl_pad = false;
         #ifdef USE_OFFLOAD
         bool _bundle_allocs = false;
         #else
diff --git a/src/kernel/lib/setup.cpp b/src/kernel/lib/setup.cpp
@@ -120,6 +120,11 @@ namespace yask {
         #ifdef USE_OFFLOAD
         _omp_hostn = omp_get_initial_device();
         _omp_devn = omp_get_default_device();
+
+        // Heuristic to assign GPU n to rank n on this node.
+        // Assumes shm is local to a node.
+        if (my_rank > 0 && omp_get_num_devices() > my_shm_rank)
+            _omp_devn = my_shm_rank;
         #endif
 
         #else
diff --git a/src/kernel/lib/soln_apis.cpp b/src/kernel/lib/soln_apis.cpp
@@ -151,8 +151,14 @@ namespace yask {
         reset_auto_tuner(actl_opts->_do_auto_tune, false);
 
         // Report ranks.
+        #ifdef USE_MPI
         DEBUG_MSG("\nNum MPI ranks:             " << env->get_num_ranks() <<
-                  "\nThis MPI rank index:       " << env->get_rank_index());
+                  "\nThis MPI rank index:       " << env->get_rank_index() <<
+                  "\nNum shm-group MPI ranks:   " << env->num_shm_ranks <<
+                  "\nThis shm-group MPI rank:   " << env->my_shm_rank);
+        #else
+        DEBUG_MSG("\nMPI not supported in this binary");
+        #endif
 
         // report threads.
         {
diff --git a/src/kernel/lib/stencil_calc.hpp b/src/kernel/lib/stencil_calc.hpp
@@ -690,7 +690,7 @@ namespace yask {
             // Full rectilinear polytope of aligned vecs.
             else {
                 TRACE_MSG("calculating vecs within "
-                          "normalized local indices " <<
+                          "*normalized* local indices " <<
                           norm_fvidxs.make_range_str(true) <<
                           " via outer thread " << outer_thread_idx <<
                           " and inner thread " << inner_thread_idx);
@@ -712,15 +712,16 @@ namespace yask {
                           sb_fvidxs.make_range_str(true) <<
                           " via outer thread " << outer_thread_idx <<
                           " and inner thread " << inner_thread_idx);
-                #if VPTS == 1
+                #if VLEN == 1
                 THROW_YASK_EXCEPTION("(internal fault) vector border-code not expected with vec-size==1");
                 #else
 
                 // Normalized vector indices.
                 auto norm_ovidxs = normalize_indices(sb_ovidxs);
 
-                // Need to find range in each border part.
-                // 2D example w/4 edges and 4 corners:
+                // Need to find range in each border part.  2D example w/4
+                // edges and 4 corners:
+                //
                 // +---+------+---+
                 // | lx|      |rx |
                 // | ly|  ly  |ly |
@@ -839,7 +840,7 @@ namespace yask {
                             if (pv_needed) {
                                 TRACE_MSG("calculating partial vectors with mask 0x" << 
                                           std::hex << pv_mask << std::dec << " for " << descr <<
-                                          " within normalized local indices " <<
+                                          " within *normalized* local indices " <<
                                           pv_part.make_range_str(true) <<
                                           " via outer thread " << outer_thread_idx <<
                                           " and inner thread " << inner_thread_idx);
diff --git a/src/kernel/lib/yk_var.cpp b/src/kernel/lib/yk_var.cpp
@@ -255,8 +255,10 @@ namespace yask {
             // Adjust padding only for domain dims.
             if (_domain_dim_mask & mbit) {
 
-                // Rounding should use soln vec lengths in case
-                // this var is not vectorized.
+                // Use soln vec len for rounding to allow reading a non-vec
+                // var in this dim while calculating a vec var. (The var
+                // vec-len is always 1 or the same as the soln vec-len in a
+                // given dim.)
                 auto svl = _corep->_soln_vec_lens[i];
 
                 // Add more padding requested by options or APIs.
@@ -265,31 +267,38 @@ namespace yask {
                 new_left_pads[i] = max(new_left_pads[i], _corep->_req_left_pads[i]);
                 new_right_pads[i] = max(new_right_pads[i], _corep->_req_right_pads[i]);
 
-                // Round left pad up to vec len.
+                // Round left pad up to soln vec len.
                 new_left_pads[i] = ROUND_UP(new_left_pads[i], svl);
 
-                // Round domain + right pad up to soln vec len by extending right pad.
-                // Using soln vec len to allow reading a non-vec var in this dim
-                // while calculating a vec var. (The var vec-len is always 1 or the same
-                // as the soln vec-len in a given dim.)
-                idx_t dprp = ROUND_UP(_corep->_domains[i] + new_right_pads[i], svl);
-
-                // Calculate pads from overall domain + right pad.
-                new_right_pads[i] = dprp - _corep->_domains[i];
-                
-                // Add yet another vec to both sides. This allows full-vector reads;
-                // only writes are masked.
+                // Domain plus right pad, rounded up to a multiple of soln vec len.
+                idx_t rdpp = ROUND_UP(_corep->_domains[i] + new_right_pads[i], svl);
+
+                // Subtract domain size back out to get desired right pad.
+                new_right_pads[i] = rdpp - _corep->_domains[i];
+
+                // When vec len > 1, add an extra vec on each side to
+                // accommodate mis-alignment and extra calculations.
+                //
+                // Example:
+                // ... +-------+-+           Last full vec and partial vec domain,
+                // ... +-------+-+---+       so minimal halo is within 1-vec pad.
+                // ... +-------+-------+     But full vecs actually calc'd,
+                // ... +-------+-------+---+      so halo reads are needed beyond that.
+                // ... +-------+-------+---+---+  Rounded up for alloc.
+                #if VLEN > 1
                 new_left_pads[i] += svl;
                 new_right_pads[i] += svl;
+                #endif
 
-                // Make inner dim an odd number of vecs.
+                // Make inner dim an odd number of vecs when allowed.
                 // This reportedly helps avoid some uarch aliasing.
-                auto na = new_left_pads[i] + _corep->_domains[i] + new_right_pads[i];
+                // Only add this optional vector if not already allocated.
                 if (!p &&
                     actl_opts->_allow_addl_pad &&
-                    get_dim_name(i) == inner_layout_dim &&
-                    (na / svl) % 2 == 0) {
-                    new_right_pads[i] += svl;
+                    get_dim_name(i) == inner_layout_dim) {
+                    auto na = new_left_pads[i] + _corep->_domains[i] + new_right_pads[i];
+                    if ((na / svl) % 2 == 0)
+                        new_right_pads[i] += svl;
                 }
 
                 // If storage is allocated, get max of existing pad & new
diff --git a/src/kernel/yask.sh b/src/kernel/yask.sh
@@ -172,7 +172,7 @@ while true; do
         echo "     Run YASK executable as an argument to <command>, e.g., 'numactl -N 0'."
         echo "  -mpi_cmd <command>"
         echo "     Run YASK executable as an argument to <command>, e.g., 'mpiexec.hydra -n 4'."
-        echo "     If -mpi_cmd and -exe_prefix are both specified, this one is used first."
+        echo "     If -mpi_cmd and -exe_prefix are both specified, this one is applied first."
         echo "     The default command is based on the number of nodes and ranks (see below)."
         echo "  -force_mpi"
         echo "     Generate a default 'mpirun' prefix even if there is only 1 rank to run."
@@ -186,7 +186,6 @@ while true; do
         echo "     This value, along with the number of nodes, <N>, is used to set these defaults:"
         echo "      - Number of MPI ranks per node to <R>/<N>."
         echo "      - Number of OpenMP threads per rank based on core count (for CPU kernels only)."
-        echo "      - Default MPI command to 'mpirun -np <R> -ppn <R>/<N>'."
         echo "     If a different MPI command is needed, use -mpi_cmd <command> explicitly."
         echo "     If the env var SLURM_NTASKS is set AND if it greater than the number of nodes,"
         echo "        the default is its value."
@@ -389,15 +388,22 @@ fi
 # Set MPI command default.
 ppn=$(( $nranks / $nnodes ))
 if [[ $nranks > 1 || $force_mpi == 1 ]]; then
-    : ${mpi_cmd="mpirun -np $nranks -ppn $ppn"}
 
-    # Add default Intel MPI settings.
-    envs+=" I_MPI_PRINT_VERSION=1 I_MPI_DEBUG=5"
+    if [[ $arch_offload =~ "nv" ]]; then
+        : ${mpi_cmd="mpirun -np $nranks --oversubscribe"}
 
-    # Add NUMA pinning if number of discovered NUMA nodes
-    # equals what is being used.
-    if [[ -n "$nnumas" && $nnumas == $ppn ]]; then
-        envs+=" I_MPI_PIN_DOMAIN=numa"
+    else
+        : ${mpi_cmd="mpirun -np $nranks -ppn $ppn"}
+
+        # Add default Intel MPI settings.
+        # These will be ignored if Intel MPI isn't used.
+        envs+=" I_MPI_PRINT_VERSION=1 I_MPI_DEBUG=5"
+
+        # Add NUMA pinning if number of discovered NUMA nodes
+        # equals what is being used.
+        if [[ -n "$nnumas" && $nnumas == $ppn ]]; then
+            envs+=" I_MPI_PIN_DOMAIN=numa"
+        fi
     fi
 
     # Check whether HBM policy setting is allowed.
diff --git a/utils/bin/yask_log_to_csv.pl b/utils/bin/yask_log_to_csv.pl
@@ -45,15 +45,20 @@
 
 # Header.
 YaskUtils::printCsvHeader($outFH);
-print $outFH ",log file\n";
+print $outFH ",date & time,log file\n";
 
 # Values from files.
 for my $arg (@ARGV) {
   for my $fn (glob $arg) {
     my %results;
     YaskUtils::getResultsFromFile(\%results, $fn);
 
+    my $datestr = "";
+    if ($fn =~ /(\d{4})-(\d{2})-(\d{2})_(\d{2})-(\d{2})-(\d{2})/) {
+      $datestr = "$2/$3/$1 $4:$5:$6"; # format for Excel.
+    }
+
     YaskUtils::printCsvValues(\%results, $outFH);
-    print $outFH ",\"$fn\"\n";
+    print $outFH ",\"$datestr\",\"$fn\"\n";
   }
 }