diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 974eff6f3..d790f853c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -64,7 +64,7 @@ jobs: build-dir: build options: ENABLE_WARNINGS_AS_ERRORS=Off - BLT_CXX_STD=c++17 + BLT_CXX_STD=c++20 CMAKE_BUILD_TYPE=Release PERFSUITE_RUN_SHORT_TEST=On ${{ matrix.shared.args }} diff --git a/CMakeLists.txt b/CMakeLists.txt index a995388aa..33fdd9e2b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,17 +17,17 @@ set(COMPILERS_KNOWN_TO_CMAKE33 AppleClang Clang GNU MSVC) include(CheckCXXCompilerFlag) if(NOT DEFINED BLT_CXX_STD) - if("cxx_std_20" IN_LIST CMAKE_CXX_KNOWN_FEATURES) - set(BLT_CXX_STD c++20 CACHE STRING "Version of C++ standard") + if("cxx_std_23" IN_LIST CMAKE_CXX_KNOWN_FEATURES) + set(BLT_CXX_STD c++23 CACHE STRING "Version of C++ standard") message("Using C++ standard: ${BLT_CXX_STD}") - elseif("cxx_std_17" IN_LIST CMAKE_CXX_KNOWN_FEATURES) - set(BLT_CXX_STD c++17 CACHE STRING "Version of C++ standard") + elseif("cxx_std_20" IN_LIST CMAKE_CXX_KNOWN_FEATURES) + set(BLT_CXX_STD c++20 CACHE STRING "Version of C++ standard") message("Using C++ standard: ${BLT_CXX_STD}") elseif("${CMAKE_CXX_COMPILER_ID}" IN_LIST COMPILERS_KNOWN_TO_CMAKE33) - set(BLT_CXX_STD c++17 CACHE STRING "Version of C++ standard") + set(BLT_CXX_STD c++20 CACHE STRING "Version of C++ standard") message("Using C++ standard: ${BLT_CXX_STD}") else() #cmake has no idea what to do, do it ourselves... 
- set(flag_var "c++17") + set(flag_var "c++20") CHECK_CXX_COMPILER_FLAG("-std=${flag_var}" COMPILER_SUPPORTS_${flag_var}) if(COMPILER_SUPPORTS_${flag_var}) set(BLT_CXX_STD ${flag_var} CACHE STRING "Version of C++ standard") @@ -38,8 +38,9 @@ if(NOT DEFINED BLT_CXX_STD) else() #check BLT_CXX_STD is high enough by disallowing the only invalid option if(("${BLT_CXX_STD}" STREQUAL "c++98") OR ("${BLT_CXX_STD}" STREQUAL "c++11") OR - ("${BLT_CXX_STD}" STREQUAL "c++14")) - message(FATAL_ERROR "RAJA requires minimum C++ standard of c++17") + ("${BLT_CXX_STD}" STREQUAL "c++14") OR + ("${BLT_CXX_STD}" STREQUAL "c++17")) + message(FATAL_ERROR "RAJA requires minimum C++ standard of c++20") endif() endif(NOT DEFINED BLT_CXX_STD) diff --git a/scripts/alcf-builds/sycl.sh b/scripts/alcf-builds/sycl.sh index 6988c505a..bcecaa44c 100755 --- a/scripts/alcf-builds/sycl.sh +++ b/scripts/alcf-builds/sycl.sh @@ -26,7 +26,7 @@ cmake \ -DENABLE_TARGET_OPENMP=Off \ -DENABLE_ALL_WARNINGS=Off \ -DENABLE_SYCL=On \ - -DCMAKE_CXX_STANDARD=17 \ + -DCMAKE_CXX_STANDARD=20 \ -DCMAKE_LINKER=icpx \ "$@" \ .. 
diff --git a/scripts/lc-builds/corona_sycl.sh b/scripts/lc-builds/corona_sycl.sh index a40ff1f91..4629860bc 100755 --- a/scripts/lc-builds/corona_sycl.sh +++ b/scripts/lc-builds/corona_sycl.sh @@ -67,7 +67,7 @@ cmake \ -DCMAKE_C_COMPILER=clang \ -DCMAKE_CXX_COMPILER=clang++ \ -DCMAKE_LINKER=clang++ \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -DENABLE_TESTS=On \ -DENABLE_EXAMPLES=On \ "$@" \ diff --git a/scripts/lc-builds/toss4_amdclang.sh b/scripts/lc-builds/toss4_amdclang.sh index 0ff0e04b9..7dd3af73a 100755 --- a/scripts/lc-builds/toss4_amdclang.sh +++ b/scripts/lc-builds/toss4_amdclang.sh @@ -95,7 +95,7 @@ cmake \ -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ -DGPU_TARGETS="${COMP_ARCH}" \ -DAMDGPU_TARGETS="${COMP_ARCH}" \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ diff --git a/scripts/lc-builds/toss4_amdclang_asan.sh b/scripts/lc-builds/toss4_amdclang_asan.sh index c525f0465..94582d0a5 100755 --- a/scripts/lc-builds/toss4_amdclang_asan.sh +++ b/scripts/lc-builds/toss4_amdclang_asan.sh @@ -102,7 +102,7 @@ cmake \ -DCMAKE_CXX_FLAGS="-fsanitize=address -fsanitize=undefined -shared-libsan" \ -DCMAKE_HIP_FLAGS="-fsanitize=address -fsanitize=undefined -shared-libsan -fgpu-rdc --hip-version=${COMP_HIP_VER}" \ -DCMAKE_EXE_LINKER_FLAGS="-L/opt/rocm-${COMP_HIP_VER}/lib/asan/ -L/opt/rocm-${COMP_HIP_VER}/llvm/lib/asan -Wl,-rpath,/opt/rocm-${COMP_HIP_VER}/lib/asan/:/opt/rocm-${COMP_HIP_VER}/llvm/lib/asan:/opt/rocm-${COMP_HIP_VER}/lib/llvm/lib/clang/${COMP_CLANG_MAJOR_VER}/lib/linux -fgpu-rdc --hip-version=${COMP_HIP_VER}" \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ diff --git a/scripts/lc-builds/toss4_cce_hip.sh b/scripts/lc-builds/toss4_cce_hip.sh index e4b29132a..f09597805 100755 --- a/scripts/lc-builds/toss4_cce_hip.sh +++ b/scripts/lc-builds/toss4_cce_hip.sh @@ -69,7 +69,7 @@ cmake \ -DCMAKE_HIP_ARCHITECTURES=${HIP_ARCH} \ 
-DGPU_TARGETS=${HIP_ARCH} \ -DAMDGPU_TARGETS=${HIP_ARCH} \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ diff --git a/scripts/lc-builds/toss4_clang-mpi_caliper.sh b/scripts/lc-builds/toss4_clang-mpi_caliper.sh index e19ad3e10..9d1e8c680 100755 --- a/scripts/lc-builds/toss4_clang-mpi_caliper.sh +++ b/scripts/lc-builds/toss4_clang-mpi_caliper.sh @@ -57,7 +57,7 @@ module load cmake/${CMAKE_VER} cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_VER}/bin/clang++ \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_MPI=ON \ -DENABLE_OPENMP=On \ diff --git a/scripts/lc-builds/toss4_clang.sh b/scripts/lc-builds/toss4_clang.sh index a97d23b1a..a3ae32ed4 100755 --- a/scripts/lc-builds/toss4_clang.sh +++ b/scripts/lc-builds/toss4_clang.sh @@ -51,7 +51,7 @@ module load cmake/${CMAKE_VER} cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_VER}/bin/clang++ \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ diff --git a/scripts/lc-builds/toss4_clang_caliper.sh b/scripts/lc-builds/toss4_clang_caliper.sh index 3c750f031..cb71ec540 100755 --- a/scripts/lc-builds/toss4_clang_caliper.sh +++ b/scripts/lc-builds/toss4_clang_caliper.sh @@ -57,7 +57,7 @@ module load cmake/${CMAKE_VER} cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_VER}/bin/clang++ \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ diff --git a/scripts/lc-builds/toss4_cray-mpich_amdclang.sh b/scripts/lc-builds/toss4_cray-mpich_amdclang.sh index c95551010..c8c981044 100755 --- a/scripts/lc-builds/toss4_cray-mpich_amdclang.sh +++ b/scripts/lc-builds/toss4_cray-mpich_amdclang.sh @@ -119,7 +119,7 @@ cmake \ 
-DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ -DGPU_TARGETS="${COMP_ARCH}" \ -DAMDGPU_TARGETS="${COMP_ARCH}" \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_MPI=ON \ -DENABLE_HIP=ON \ diff --git a/scripts/lc-builds/toss4_cray-mpich_amdclang_asan.sh b/scripts/lc-builds/toss4_cray-mpich_amdclang_asan.sh index ae3a3d0e7..d55dd2ade 100755 --- a/scripts/lc-builds/toss4_cray-mpich_amdclang_asan.sh +++ b/scripts/lc-builds/toss4_cray-mpich_amdclang_asan.sh @@ -126,7 +126,7 @@ cmake \ -DCMAKE_CXX_FLAGS="-fsanitize=address -shared-libsan" \ -DCMAKE_HIP_FLAGS="-fsanitize=address -shared-libsan -fgpu-rdc --hip-version=${COMP_HIP_VER}" \ -DCMAKE_EXE_LINKER_FLAGS="-L/opt/rocm-${COMP_HIP_VER}/lib/asan/ -L/opt/rocm-${COMP_HIP_VER}/llvm/lib/asan -Wl,-rpath,/opt/rocm-${COMP_HIP_VER}/lib/asan/:/opt/rocm-${COMP_HIP_VER}/llvm/lib/asan:/opt/rocm-${COMP_HIP_VER}/lib/llvm/lib/clang/${COMP_CLANG_MAJOR_VER}/lib/linux -fgpu-rdc --hip-version=${COMP_HIP_VER}" \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_MPI=ON \ -DENABLE_HIP=ON \ diff --git a/scripts/lc-builds/toss4_cray-mpich_amdclang_caliper.sh b/scripts/lc-builds/toss4_cray-mpich_amdclang_caliper.sh index 479da07c5..942cbb953 100755 --- a/scripts/lc-builds/toss4_cray-mpich_amdclang_caliper.sh +++ b/scripts/lc-builds/toss4_cray-mpich_amdclang_caliper.sh @@ -123,7 +123,7 @@ cmake \ -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ -DGPU_TARGETS="${COMP_ARCH}" \ -DAMDGPU_TARGETS="${COMP_ARCH}" \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_MPI=ON \ -DENABLE_HIP=ON \ diff --git a/scripts/lc-builds/toss4_gcc-mpi_caliper.sh b/scripts/lc-builds/toss4_gcc-mpi_caliper.sh index a3b4ddf29..4fd0dc913 100755 --- a/scripts/lc-builds/toss4_gcc-mpi_caliper.sh +++ b/scripts/lc-builds/toss4_gcc-mpi_caliper.sh @@ -58,7 +58,7 @@ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_C_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/gcc \ 
-DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/g++ \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_MPI=ON \ -DENABLE_OPENMP=On \ diff --git a/scripts/lc-builds/toss4_gcc.sh b/scripts/lc-builds/toss4_gcc.sh index da1badd62..50f015c7b 100755 --- a/scripts/lc-builds/toss4_gcc.sh +++ b/scripts/lc-builds/toss4_gcc.sh @@ -51,7 +51,7 @@ module load cmake/${CMAKE_VER} cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/g++ \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ diff --git a/scripts/lc-builds/toss4_gcc_caliper.sh b/scripts/lc-builds/toss4_gcc_caliper.sh index 9304c00d4..24e83dba2 100755 --- a/scripts/lc-builds/toss4_gcc_caliper.sh +++ b/scripts/lc-builds/toss4_gcc_caliper.sh @@ -57,7 +57,7 @@ module load cmake/${CMAKE_VER} cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/g++ \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ diff --git a/scripts/lc-builds/toss4_hipcc.sh b/scripts/lc-builds/toss4_hipcc.sh index 52b1e0951..52357e46e 100755 --- a/scripts/lc-builds/toss4_hipcc.sh +++ b/scripts/lc-builds/toss4_hipcc.sh @@ -85,7 +85,7 @@ cmake \ -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ -DGPU_TARGETS="${COMP_ARCH}" \ -DAMDGPU_TARGETS="${COMP_ARCH}" \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ diff --git a/scripts/lc-builds/toss4_icpx.sh b/scripts/lc-builds/toss4_icpx.sh index b02276661..d885261a8 100755 --- a/scripts/lc-builds/toss4_icpx.sh +++ b/scripts/lc-builds/toss4_icpx.sh @@ -64,7 +64,7 @@ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=icpx \ -DCMAKE_C_COMPILER=icx \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ 
-DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ -DENABLE_OPENMP=On \ diff --git a/scripts/lc-builds/toss4_mvapich2_icpx.sh b/scripts/lc-builds/toss4_mvapich2_icpx.sh index 4a99f7a55..33363022d 100755 --- a/scripts/lc-builds/toss4_mvapich2_icpx.sh +++ b/scripts/lc-builds/toss4_mvapich2_icpx.sh @@ -65,7 +65,7 @@ cmake \ -DMPI_CXX_COMPILER="/usr/tce/packages/mvapich2/mvapich2-${MPI_VER}-intel-${COMP_VER}/bin/mpicxx" \ -DCMAKE_CXX_COMPILER=icpx \ -DCMAKE_C_COMPILER=icx \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_MPI=ON \ -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ diff --git a/scripts/ubuntu-builds/ubuntu_amdclang.sh b/scripts/ubuntu-builds/ubuntu_amdclang.sh index 1ef233272..fd0f2db1b 100755 --- a/scripts/ubuntu-builds/ubuntu_amdclang.sh +++ b/scripts/ubuntu-builds/ubuntu_amdclang.sh @@ -62,7 +62,7 @@ cmake \ -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ -DGPU_TARGETS="${COMP_ARCH}" \ -DAMDGPU_TARGETS="${COMP_ARCH}" \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ diff --git a/src/algorithm/ATOMIC.cpp b/src/algorithm/ATOMIC.cpp index 7dadad386..490afaf9e 100644 --- a/src/algorithm/ATOMIC.cpp +++ b/src/algorithm/ATOMIC.cpp @@ -77,5 +77,53 @@ void ATOMIC::tearDown(VariantID RAJAPERF_UNUSED_ARG(vid), size_t RAJAPERF_UNUSED } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void ATOMIC::setCountedAttributes() +{ + const size_t replication = getActualProblemSize(); + + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + ATOMIC_DATA_SETUP(replication); + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + 
RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ATOMIC_BODY(RAJAPERF_ATOMIC_ADD_COUNTING, i, ATOMIC_VALUE)); + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + ATOMIC_DATA_TEARDOWN(replication); + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/ATOMIC.hpp b/src/algorithm/ATOMIC.hpp index 6714a3097..11082fd35 100644 --- a/src/algorithm/ATOMIC.hpp +++ b/src/algorithm/ATOMIC.hpp @@ -70,6 +70,7 @@ class ATOMIC : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/algorithm/HISTOGRAM.cpp b/src/algorithm/HISTOGRAM.cpp index 0a501a95e..dc9123647 100644 --- a/src/algorithm/HISTOGRAM.cpp +++ b/src/algorithm/HISTOGRAM.cpp @@ -145,5 +145,63 @@ void HISTOGRAM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(DataSpace::Host, m_counts_final); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void HISTOGRAM::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + HISTOGRAM_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + HISTOGRAM_SETUP_COUNTS; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + 
RAJAPERF_COUNTERS_LOOP_BODY(HISTOGRAM_BODY(RAJAPERF_ATOMIC_ADD_COUNTING)); + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + HISTOGRAM_FINALIZE_COUNTS; + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + HISTOGRAM_TEARDOWN_COUNTS; + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/HISTOGRAM.hpp b/src/algorithm/HISTOGRAM.hpp index 7f8d3121f..cb10e2620 100644 --- a/src/algorithm/HISTOGRAM.hpp +++ b/src/algorithm/HISTOGRAM.hpp @@ -92,6 +92,7 @@ class HISTOGRAM : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/algorithm/MEMCPY.cpp b/src/algorithm/MEMCPY.cpp index bf5b2f0d9..7df2de2e7 100644 --- a/src/algorithm/MEMCPY.cpp +++ b/src/algorithm/MEMCPY.cpp @@ -77,5 +77,47 @@ void MEMCPY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MEMCPY::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MEMCPY_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(MEMCPY_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace algorithm } // end 
namespace rajaperf diff --git a/src/algorithm/MEMCPY.hpp b/src/algorithm/MEMCPY.hpp index 0737d1103..d33913468 100644 --- a/src/algorithm/MEMCPY.hpp +++ b/src/algorithm/MEMCPY.hpp @@ -50,6 +50,7 @@ class MEMCPY : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/algorithm/MEMSET.cpp b/src/algorithm/MEMSET.cpp index 10a08ac91..829656567 100644 --- a/src/algorithm/MEMSET.cpp +++ b/src/algorithm/MEMSET.cpp @@ -76,5 +76,47 @@ void MEMSET::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_x, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MEMSET::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MEMSET_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(MEMSET_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/MEMSET.hpp b/src/algorithm/MEMSET.hpp index 3573d21a2..a747dd7dc 100644 --- a/src/algorithm/MEMSET.hpp +++ b/src/algorithm/MEMSET.hpp @@ -50,6 +50,7 @@ class MEMSET : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void 
setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/algorithm/REDUCE_SUM.cpp b/src/algorithm/REDUCE_SUM.cpp index 1155ad667..ce3d0f861 100644 --- a/src/algorithm/REDUCE_SUM.cpp +++ b/src/algorithm/REDUCE_SUM.cpp @@ -78,5 +78,55 @@ void REDUCE_SUM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_x, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void REDUCE_SUM::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_SUM_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Real_type sum = m_sum_init; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(REDUCE_SUM_BODY); + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_sum = sum; + ); + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/REDUCE_SUM.hpp b/src/algorithm/REDUCE_SUM.hpp index 863ee54c7..7eb33a1b1 100644 --- a/src/algorithm/REDUCE_SUM.hpp +++ b/src/algorithm/REDUCE_SUM.hpp @@ -54,6 +54,7 @@ class REDUCE_SUM : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index 22a462aa4..32e285654 100644 --- 
a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -76,5 +76,50 @@ void SCAN::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void SCAN::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + SCAN_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + SCAN_PROLOGUE; + ); + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(SCAN_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/SCAN.hpp b/src/algorithm/SCAN.hpp index c0f2ab28f..6db94218c 100644 --- a/src/algorithm/SCAN.hpp +++ b/src/algorithm/SCAN.hpp @@ -57,6 +57,7 @@ class SCAN : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index b2b32cce2..75ecefb27 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -36,6 +36,7 @@ blt_add_library( EDGE3D-OMP.cpp EDGE3D-OMPTarget.cpp EDGE3D-Sycl.cpp + EDGE3D_COUNT.cpp ENERGY.cpp ENERGY-Seq.cpp ENERGY-Hip.cpp diff --git a/src/apps/CONVECTION3DPA.cpp b/src/apps/CONVECTION3DPA.cpp index 12f91e8d6..c6a7d08ff 100644 --- a/src/apps/CONVECTION3DPA.cpp +++ 
b/src/apps/CONVECTION3DPA.cpp @@ -106,5 +106,149 @@ void CONVECTION3DPA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx deallocData(m_Y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void CONVECTION3DPA::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + CONVECTION3DPA_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type e = 0; e < NE; ++e)) { + RAJAPERF_COUNTERS_TEAM_CONTEXT(); + + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_0_CPU); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz,z,conv::D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy,y,conv::D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx,x,conv::D1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_1); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz,z,conv::D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy,y,conv::D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx,x,conv::Q1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_2); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz,z,conv::D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx,x,conv::Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy,y,conv::Q1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_3); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx,x,conv::Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy,y,conv::Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qz,z,conv::Q1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_4); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + 
RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qz,z,conv::Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy,y,conv::Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx,x,conv::Q1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_5); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx,x,conv::Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy,y,conv::Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz,z,conv::D1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_6); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz,z,conv::D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx,x,conv::Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy,y,conv::D1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_7); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz,z,conv::D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy,y,conv::D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx,x,conv::D1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_8); + } + } + } + + } // element loop + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/CONVECTION3DPA.hpp b/src/apps/CONVECTION3DPA.hpp index 13c26c00c..f23ef17af 100644 --- a/src/apps/CONVECTION3DPA.hpp +++ b/src/apps/CONVECTION3DPA.hpp @@ -233,50 +233,50 @@ constexpr RAJA::Index_type VDIM = 3; conv::VDIM * conv::Q1D * conv::Q1D * conv::Q1D * e] #define CONVECTION3DPA_0_GPU \ - constexpr Index_type max_D1D = conv::D1D; \ - constexpr Index_type max_Q1D = conv::Q1D; \ - constexpr Index_type max_DQ = (max_Q1D > max_D1D) ? 
max_Q1D : max_D1D; \ - RAJA_TEAM_SHARED Real_type sm0[max_DQ * max_DQ * max_DQ]; \ - RAJA_TEAM_SHARED Real_type sm1[max_DQ * max_DQ * max_DQ]; \ - RAJA_TEAM_SHARED Real_type sm2[max_DQ * max_DQ * max_DQ]; \ - RAJA_TEAM_SHARED Real_type sm3[max_DQ * max_DQ * max_DQ]; \ - RAJA_TEAM_SHARED Real_type sm4[max_DQ * max_DQ * max_DQ]; \ - RAJA_TEAM_SHARED Real_type sm5[max_DQ * max_DQ * max_DQ]; \ - Real_type(*u)[max_D1D][max_D1D] = (Real_type(*)[max_D1D][max_D1D])sm0; \ - Real_type(*Bu)[max_D1D][max_Q1D] = (Real_type(*)[max_D1D][max_Q1D])sm1; \ - Real_type(*Gu)[max_D1D][max_Q1D] = (Real_type(*)[max_D1D][max_Q1D])sm2; \ - Real_type(*BBu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm3; \ - Real_type(*GBu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm4; \ - Real_type(*BGu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm5; \ - Real_type(*GBBu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm0; \ - Real_type(*BGBu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm1; \ - Real_type(*BBGu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm2; \ - Real_type(*DGu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm3; \ - Real_type(*BDGu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm4; \ - Real_type(*BBDGu)[max_D1D][max_Q1D] = (Real_type(*)[max_D1D][max_Q1D])sm5; + constexpr auto max_D1D = conv::D1D; \ + constexpr auto max_Q1D = conv::Q1D; \ + constexpr auto max_DQ = (max_Q1D > max_D1D) ? 
max_Q1D : max_D1D; \ + RAJA_TEAM_SHARED Real_array3 sm0; \ + RAJA_TEAM_SHARED Real_array3 sm1; \ + RAJA_TEAM_SHARED Real_array3 sm2; \ + RAJA_TEAM_SHARED Real_array3 sm3; \ + RAJA_TEAM_SHARED Real_array3 sm4; \ + RAJA_TEAM_SHARED Real_array3 sm5; \ + Real_array3_ref u( sm0); \ + Real_array3_ref Bu(sm1); \ + Real_array3_ref Gu(sm2); \ + Real_array3_ref BBu(sm3); \ + Real_array3_ref GBu(sm4); \ + Real_array3_ref BGu(sm5); \ + Real_array3_ref GBBu(sm0); \ + Real_array3_ref BGBu(sm1); \ + Real_array3_ref BBGu(sm2); \ + Real_array3_ref DGu(sm3); \ + Real_array3_ref BDGu(sm4); \ + Real_array3_ref BBDGu(sm5); #define CONVECTION3DPA_0_CPU \ - constexpr Index_type max_D1D = conv::D1D; \ - constexpr Index_type max_Q1D = conv::Q1D; \ - constexpr Index_type max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D; \ - Real_type sm0[max_DQ * max_DQ * max_DQ]; \ - Real_type sm1[max_DQ * max_DQ * max_DQ]; \ - Real_type sm2[max_DQ * max_DQ * max_DQ]; \ - Real_type sm3[max_DQ * max_DQ * max_DQ]; \ - Real_type sm4[max_DQ * max_DQ * max_DQ]; \ - Real_type sm5[max_DQ * max_DQ * max_DQ]; \ - Real_type(*u)[max_D1D][max_D1D] = (Real_type(*)[max_D1D][max_D1D])sm0; \ - Real_type(*Bu)[max_D1D][max_Q1D] = (Real_type(*)[max_D1D][max_Q1D])sm1; \ - Real_type(*Gu)[max_D1D][max_Q1D] = (Real_type(*)[max_D1D][max_Q1D])sm2; \ - Real_type(*BBu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm3; \ - Real_type(*GBu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm4; \ - Real_type(*BGu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm5; \ - Real_type(*GBBu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm0; \ - Real_type(*BGBu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm1; \ - Real_type(*BBGu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm2; \ - Real_type(*DGu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm3; \ - Real_type(*BDGu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm4; \ - Real_type(*BBDGu)[max_D1D][max_Q1D] = 
(Real_type(*)[max_D1D][max_Q1D])sm5; + constexpr auto max_D1D = conv::D1D; \ + constexpr auto max_Q1D = conv::Q1D; \ + constexpr auto max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D; \ + Real_array3 sm0; \ + Real_array3 sm1; \ + Real_array3 sm2; \ + Real_array3 sm3; \ + Real_array3 sm4; \ + Real_array3 sm5; \ + Real_array3_ref u( sm0); \ + Real_array3_ref Bu(sm1); \ + Real_array3_ref Gu(sm2); \ + Real_array3_ref BBu(sm3); \ + Real_array3_ref GBu(sm4); \ + Real_array3_ref BGu(sm5); \ + Real_array3_ref GBBu(sm0); \ + Real_array3_ref BGBu(sm1); \ + Real_array3_ref BBGu(sm2); \ + Real_array3_ref DGu(sm3); \ + Real_array3_ref BDGu(sm4); \ + Real_array3_ref BBDGu(sm5); #define CONVECTION3DPA_1 u[dz][dy][dx] = CPA_X(dx, dy, dz, e); @@ -371,6 +371,7 @@ class CONVECTION3DPA : public KernelBase { void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index dda43cc27..7a271c4b8 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -107,5 +107,48 @@ void DEL_DOT_VEC_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx deallocData(m_div, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void DEL_DOT_VEC_2D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + DEL_DOT_VEC_2D_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ii = ibegin ; 
ii < iend ; ++ii )) { + RAJAPERF_COUNTERS_LOOP_BODY(DEL_DOT_VEC_2D_BODY_INDEX); + RAJAPERF_COUNTERS_LOOP_BODY(DEL_DOT_VEC_2D_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/DEL_DOT_VEC_2D.hpp b/src/apps/DEL_DOT_VEC_2D.hpp index 34490c1c2..9a856185f 100644 --- a/src/apps/DEL_DOT_VEC_2D.hpp +++ b/src/apps/DEL_DOT_VEC_2D.hpp @@ -114,6 +114,7 @@ class DEL_DOT_VEC_2D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index 6af2aa341..e50502927 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -100,5 +100,129 @@ void DIFFUSION3DPA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_Y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void DIFFUSION3DPA::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + DIFFUSION3DPA_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type e = 0; e < NE; ++e)) { + RAJAPERF_COUNTERS_TEAM_CONTEXT(); + + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_0_CPU); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz, z, diff::D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, diff::D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, diff::D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_1); + } + } + } + 
+ RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, diff::D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx, x, diff::Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_2); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz, z, diff::D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, diff::D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx, x, diff::Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_3); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz, z, diff::D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy, y, diff::Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx, x, diff::Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_4); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qz, z, diff::Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy, y, diff::Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx, x, diff::Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_5); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, diff::D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx, x, diff::Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_6); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qz, z, diff::Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy, y, diff::Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, diff::D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_7); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qz, z, diff::Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, diff::D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, diff::D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_8); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz, z, diff::D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, diff::D1D)) { + 
RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, diff::D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_9); + } + } + } + + } // element loop + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index a46f2fce6..74a93bdee 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -257,58 +257,58 @@ constexpr RAJA::Index_type DPA_SYM = 6; #define DPA_sign(q, d) (((q) <= (d)) ? -1.0 : 1.0) #define DIFFUSION3DPA_0_GPU \ - constexpr Index_type MQ1 = diff::Q1D; \ - constexpr Index_type MD1 = diff::D1D; \ - constexpr Index_type MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ - RAJA_TEAM_SHARED Real_type sBG[MQ1 * MD1]; \ - Real_type(*B)[MD1] = (Real_type(*)[MD1])sBG; \ - Real_type(*G)[MD1] = (Real_type(*)[MD1])sBG; \ - Real_type(*Bt)[MQ1] = (Real_type(*)[MQ1])sBG; \ - Real_type(*Gt)[MQ1] = (Real_type(*)[MQ1])sBG; \ - RAJA_TEAM_SHARED Real_type sm0[3][MDQ * MDQ * MDQ]; \ - RAJA_TEAM_SHARED Real_type sm1[3][MDQ * MDQ * MDQ]; \ - Real_type(*s_X)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 2); \ - Real_type(*DDQ0)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])(sm0 + 0); \ - Real_type(*DDQ1)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])(sm0 + 1); \ - Real_type(*DQQ0)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm1 + 0); \ - Real_type(*DQQ1)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm1 + 1); \ - Real_type(*DQQ2)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm1 + 2); \ - Real_type(*QQQ0)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm0 + 0); \ - Real_type(*QQQ1)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm0 + 1); \ - Real_type(*QQQ2)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm0 + 2); \ - Real_type(*QQD0)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])(sm1 + 0); \ - Real_type(*QQD1)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])(sm1 + 1); \ - Real_type(*QQD2)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])(sm1 + 2); \ - Real_type(*QDD0)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 
0); \ - Real_type(*QDD1)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 1); \ - Real_type(*QDD2)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 2); + constexpr auto MQ1 = diff::Q1D; \ + constexpr auto MD1 = diff::D1D; \ + constexpr auto MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ + RAJA_TEAM_SHARED Real_array2 sBG; \ + Real_array2_ref B(sBG); \ + Real_array2_ref G(sBG); \ + Real_array2_ref Bt(sBG); \ + Real_array2_ref Gt(sBG); \ + RAJA_TEAM_SHARED Real_array4<3, MDQ, MDQ, MDQ> sm0; \ + RAJA_TEAM_SHARED Real_array4<3, MDQ, MDQ, MDQ> sm1; \ + Real_array3_ref s_X(sm0[2]); \ + Real_array3_ref DDQ0(sm0[0]); \ + Real_array3_ref DDQ1(sm0[1]); \ + Real_array3_ref DQQ0(sm1[0]); \ + Real_array3_ref DQQ1(sm1[1]); \ + Real_array3_ref DQQ2(sm1[2]); \ + Real_array3_ref QQQ0(sm0[0]); \ + Real_array3_ref QQQ1(sm0[1]); \ + Real_array3_ref QQQ2(sm0[2]); \ + Real_array3_ref QQD0(sm1[0]); \ + Real_array3_ref QQD1(sm1[1]); \ + Real_array3_ref QQD2(sm1[2]); \ + Real_array3_ref QDD0(sm0[0]); \ + Real_array3_ref QDD1(sm0[1]); \ + Real_array3_ref QDD2(sm0[2]); #define DIFFUSION3DPA_0_CPU \ - constexpr Index_type MQ1 = diff::Q1D; \ - constexpr Index_type MD1 = diff::D1D; \ - constexpr Index_type MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ - Real_type sBG[MQ1 * MD1]; \ - Real_type(*B)[MD1] = (Real_type(*)[MD1])sBG; \ - Real_type(*G)[MD1] = (Real_type(*)[MD1])sBG; \ - Real_type(*Bt)[MQ1] = (Real_type(*)[MQ1])sBG; \ - Real_type(*Gt)[MQ1] = (Real_type(*)[MQ1])sBG; \ - Real_type sm0[3][MDQ * MDQ * MDQ]; \ - Real_type sm1[3][MDQ * MDQ * MDQ]; \ - Real_type(*s_X)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 2); \ - Real_type(*DDQ0)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])(sm0 + 0); \ - Real_type(*DDQ1)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])(sm0 + 1); \ - Real_type(*DQQ0)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm1 + 0); \ - Real_type(*DQQ1)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm1 + 1); \ - Real_type(*DQQ2)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm1 + 2); \ - Real_type(*QQQ0)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm0 + 0); \ - Real_type(*QQQ1)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm0 + 1); \ - Real_type(*QQQ2)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm0 + 2); \ - Real_type(*QQD0)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])(sm1 + 0); \ - Real_type(*QQD1)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])(sm1 + 1); \ - Real_type(*QQD2)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])(sm1 + 2); \ - Real_type(*QDD0)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 0); \ - Real_type(*QDD1)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 1); \ - Real_type(*QDD2)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 2); + constexpr auto MQ1 = diff::Q1D; \ + constexpr auto MD1 = diff::D1D; \ + constexpr auto MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ + Real_array2 sBG; \ + Real_array2_ref B(sBG); \ + Real_array2_ref G(sBG); \ + Real_array2_ref Bt(sBG); \ + Real_array2_ref Gt(sBG); \ + Real_array4<3, MDQ, MDQ, MDQ> sm0; \ + Real_array4<3, MDQ, MDQ, MDQ> sm1; \ + Real_array3_ref s_X(sm0[2]); \ + Real_array3_ref DDQ0(sm0[0]); \ + Real_array3_ref DDQ1(sm0[1]); \ + Real_array3_ref DQQ0(sm1[0]); \ + Real_array3_ref DQQ1(sm1[1]); \ + Real_array3_ref DQQ2(sm1[2]); \ + Real_array3_ref QQQ0(sm0[0]); \ + Real_array3_ref QQQ1(sm0[1]); \ + Real_array3_ref QQQ2(sm0[2]); \ + Real_array3_ref QQD0(sm1[0]); \ + Real_array3_ref QQD1(sm1[1]); \ + Real_array3_ref QQD2(sm1[2]); \ + Real_array3_ref QDD0(sm0[0]); \ + Real_array3_ref QDD1(sm0[1]); \ + Real_array3_ref QDD2(sm0[2]); #define DIFFUSION3DPA_1 s_X[dz][dy][dx] = DPA_X(dx, dy, dz, e); @@ -419,6 +419,7 @@ class DIFFUSION3DPA : public KernelBase { void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/apps/EDGE3D-Cuda.cpp b/src/apps/EDGE3D-Cuda.cpp index 2c413607d..1efb96db5 100644 --- a/src/apps/EDGE3D-Cuda.cpp +++ b/src/apps/EDGE3D-Cuda.cpp @@ -13,6 +13,8 @@ #if defined(RAJA_ENABLE_CUDA) +#include "EDGE3D_HELPER.hpp" + #include "common/CudaDataUtils.hpp" #include "AppsData.hpp" diff --git a/src/apps/EDGE3D-Hip.cpp b/src/apps/EDGE3D-Hip.cpp index e9a0dae7c..c53eafabe 100644 --- a/src/apps/EDGE3D-Hip.cpp +++ b/src/apps/EDGE3D-Hip.cpp @@ -13,6 +13,8 @@ #if defined(RAJA_ENABLE_HIP) +#include "EDGE3D_HELPER.hpp" + #include "common/HipDataUtils.hpp" #include "AppsData.hpp" diff --git a/src/apps/EDGE3D-OMP.cpp b/src/apps/EDGE3D-OMP.cpp index 14001ff5b..28a052505 100644 --- a/src/apps/EDGE3D-OMP.cpp +++ b/src/apps/EDGE3D-OMP.cpp @@ -11,6 +11,8 @@ #include "RAJA/RAJA.hpp" +#include "EDGE3D_HELPER.hpp" + #include "AppsData.hpp" #include diff --git 
a/src/apps/EDGE3D-OMPTarget.cpp b/src/apps/EDGE3D-OMPTarget.cpp index a5d2bdba5..c40242141 100644 --- a/src/apps/EDGE3D-OMPTarget.cpp +++ b/src/apps/EDGE3D-OMPTarget.cpp @@ -13,6 +13,8 @@ #if defined(RAJA_ENABLE_TARGET_OPENMP) +#include "EDGE3D_HELPER.hpp" + #include "common/OpenMPTargetDataUtils.hpp" #include "AppsData.hpp" diff --git a/src/apps/EDGE3D-Seq.cpp b/src/apps/EDGE3D-Seq.cpp index 8f927fe58..a7b42fb55 100644 --- a/src/apps/EDGE3D-Seq.cpp +++ b/src/apps/EDGE3D-Seq.cpp @@ -11,6 +11,8 @@ #include "RAJA/RAJA.hpp" +#include "EDGE3D_HELPER.hpp" + #include "AppsData.hpp" #include diff --git a/src/apps/EDGE3D-Sycl.cpp b/src/apps/EDGE3D-Sycl.cpp index 34fab5bc6..1b6b96b4a 100644 --- a/src/apps/EDGE3D-Sycl.cpp +++ b/src/apps/EDGE3D-Sycl.cpp @@ -13,6 +13,8 @@ #if defined(RAJA_ENABLE_SYCL) +#include "EDGE3D_HELPER.hpp" + #include "common/SyclDataUtils.hpp" #include "AppsData.hpp" diff --git a/src/apps/EDGE3D.cpp b/src/apps/EDGE3D.cpp index b544c4c91..90f1a031d 100644 --- a/src/apps/EDGE3D.cpp +++ b/src/apps/EDGE3D.cpp @@ -16,6 +16,7 @@ #include +#include "EDGE3D_HELPER.hpp" namespace rajaperf { @@ -67,22 +68,22 @@ void EDGE3D::setSize(Index_type target_size, Index_type target_reps) setBytesModifyWrittenPerRep( 0 ); setBytesAtomicModifyWrittenPerRep( 0 ); - constexpr size_t flops_k_loop = 15 - + 6*flops_Jxx() - + flops_jacobian_inv() - + flops_transform_basis(EB) // flops for transform_edge_basis() - + flops_transform_basis(EB) + 9 // flops for transform_curl_edge_basis() - + 2*flops_inner_product<12, 12>(true); + const size_t flops_k_loop = 15 + + 6*flops_Jxx() + + flops_jacobian_inv() + + flops_transform_basis(EB) // flops for transform_edge_basis() + + flops_transform_basis(EB) + 9 // flops for transform_curl_edge_basis() + + 2*flops_inner_product<12, 12>(true); - constexpr size_t flops_j_loop = flops_k_loop*NQ_1D + 3*flops_Jxx() + 6; - constexpr size_t flops_i_loop = flops_j_loop*NQ_1D + 1; + const size_t flops_j_loop = flops_k_loop*NQ_1D + 3*flops_Jxx() + 6; + 
const size_t flops_i_loop = flops_j_loop*NQ_1D + 1; - constexpr size_t flops_edge_MpSmatrix = 9*flops_Jxx() - + flops_compute_detj() - + flops_i_loop*NQ_1D; + const size_t flops_edge_MpSmatrix = 9*flops_Jxx() + + flops_compute_detj() + + flops_i_loop*NQ_1D; - constexpr size_t flops_per_element = flops_edge_MpSmatrix - + (EB*EB + EB); // sum + const size_t flops_per_element = flops_edge_MpSmatrix + + (EB*EB + EB); // sum setFLOPsPerRep(number_of_elements * flops_per_element); } diff --git a/src/apps/EDGE3D.hpp b/src/apps/EDGE3D.hpp index df19ff7f5..9a917f02c 100644 --- a/src/apps/EDGE3D.hpp +++ b/src/apps/EDGE3D.hpp @@ -189,178 +189,12 @@ #ifndef RAJAPerf_Apps_EDGE3D_HPP #define RAJAPerf_Apps_EDGE3D_HPP +#define NB 8 +#define EB 12 +#define FB 6 +#define MAX_QUAD_ORDER 5 #define NQ_1D 2 -#include "mixed_fem_helper.hpp" - -RAJA_HOST_DEVICE -RAJA_INLINE void edge_MpSmatrix( - const rajaperf::Real_type (&x)[NB], - const rajaperf::Real_type (&y)[NB], - const rajaperf::Real_type (&z)[NB], - rajaperf::Real_type alpha, - rajaperf::Real_type beta, - const rajaperf::Real_type detj_tol, - const rajaperf::Int_type quad_type, - const rajaperf::Int_type quad_order, - rajaperf::Real_type (&matrix)[EB][EB]) -{ - // Get integration points and weights - rajaperf::Real_type qpts_1d[MAX_QUAD_ORDER]; - rajaperf::Real_type wgts_1d[MAX_QUAD_ORDER]; - - get_quadrature_rule(quad_type, quad_order, qpts_1d, wgts_1d); - - // Compute cell centered Jacobian - const rajaperf::Real_type jxx_cc = Jxx(x, y, z, 0.25, 0.25, 0.25, 0.25); - const rajaperf::Real_type jxy_cc = Jxy(x, y, z, 0.25, 0.25, 0.25, 0.25); - const rajaperf::Real_type jxz_cc = Jxz(x, y, z, 0.25, 0.25, 0.25, 0.25); - const rajaperf::Real_type jyx_cc = Jyx(x, y, z, 0.25, 0.25, 0.25, 0.25); - const rajaperf::Real_type jyy_cc = Jyy(x, y, z, 0.25, 0.25, 0.25, 0.25); - const rajaperf::Real_type jyz_cc = Jyz(x, y, z, 0.25, 0.25, 0.25, 0.25); - const rajaperf::Real_type jzx_cc = Jzx(x, y, z, 0.25, 0.25, 0.25, 0.25); - const 
rajaperf::Real_type jzy_cc = Jzy(x, y, z, 0.25, 0.25, 0.25, 0.25); - const rajaperf::Real_type jzz_cc = Jzz(x, y, z, 0.25, 0.25, 0.25, 0.25); - - // Compute cell centered Jacobian determinant - const rajaperf::Real_type detj_cc = compute_detj( - jxx_cc, jxy_cc, jxz_cc, - jyx_cc, jyy_cc, jyz_cc, - jzx_cc, jzy_cc, jzz_cc); - - // Initialize the stiffness matrix - for (rajaperf::Int_type m = 0; m < EB; m++) { - for (rajaperf::Int_type p = m; p < EB; p++) { - matrix[m][p] = 0.0; - } - } - - // Compute values at each quadrature point - for ( rajaperf::Int_type i = 0; i < quad_order; i++ ) { - - const rajaperf::Real_type xloc = qpts_1d[i]; - const rajaperf::Real_type tmpx = 1. - xloc; - - rajaperf::Real_type dbasisx[EB] = {0}; - curl_edgebasis_x(dbasisx, tmpx, xloc); - - for ( rajaperf::Int_type j = 0; j < quad_order; j++ ) { - - const rajaperf::Real_type yloc = qpts_1d[j]; - const rajaperf::Real_type wgtxy = wgts_1d[i]*wgts_1d[j]; - const rajaperf::Real_type tmpy = 1. - yloc; - - rajaperf::Real_type tmpxy = tmpx*tmpy; - rajaperf::Real_type xyloc = xloc*yloc; - rajaperf::Real_type tmpxyloc = tmpx*yloc; - rajaperf::Real_type xloctmpy = xloc*tmpy; - - const rajaperf::Real_type jzx = Jzx(x, y, z, tmpxy, xloctmpy, xyloc, tmpxyloc); - const rajaperf::Real_type jzy = Jzy(x, y, z, tmpxy, xloctmpy, xyloc, tmpxyloc); - const rajaperf::Real_type jzz = Jzz(x, y, z, tmpxy, xloctmpy, xyloc, tmpxyloc); - - rajaperf::Real_type ebasisz[EB] = {0}; - edgebasis_z(ebasisz, tmpxy, xloctmpy, xyloc, tmpxyloc); - - rajaperf::Real_type dbasisy[EB] = {0}; - curl_edgebasis_y(dbasisy, tmpy, yloc); - - // Differeniate basis with respect to z at this quadrature point - - for ( rajaperf::Int_type k = 0; k < quad_order; k++ ) { - - const rajaperf::Real_type zloc = qpts_1d[k]; - const rajaperf::Real_type wgts = wgtxy*wgts_1d[k]; - const rajaperf::Real_type tmpz = 1. 
- zloc; - - const rajaperf::Real_type tmpxz = tmpx*tmpz; - const rajaperf::Real_type tmpyz = tmpy*tmpz; - - const rajaperf::Real_type xzloc = xloc*zloc; - const rajaperf::Real_type yzloc = yloc*zloc; - - const rajaperf::Real_type tmpyzloc = tmpy*zloc; - const rajaperf::Real_type tmpxzloc = tmpx*zloc; - - const rajaperf::Real_type yloctmpz = yloc*tmpz; - const rajaperf::Real_type xloctmpz = xloc*tmpz; - - const rajaperf::Real_type jxx = Jxx(x, y, z, tmpyz, yloctmpz, tmpyzloc, yzloc); - const rajaperf::Real_type jxy = Jxy(x, y, z, tmpyz, yloctmpz, tmpyzloc, yzloc); - const rajaperf::Real_type jxz = Jxz(x, y, z, tmpyz, yloctmpz, tmpyzloc, yzloc); - const rajaperf::Real_type jyx = Jyx(x, y, z, tmpxz, xloctmpz, tmpxzloc, xzloc); - const rajaperf::Real_type jyy = Jyy(x, y, z, tmpxz, xloctmpz, tmpxzloc, xzloc); - const rajaperf::Real_type jyz = Jyz(x, y, z, tmpxz, xloctmpz, tmpxzloc, xzloc); - - rajaperf::Real_type jinvxx, jinvxy, jinvxz, - jinvyx, jinvyy, jinvyz, - jinvzx, jinvzy, jinvzz, - detj_unfixed, detj, abs_detj, invdetj; - - jacobian_inv( - jxx, jxy, jxz, - jyx, jyy, jyz, - jzx, jzy, jzz, - detj_cc, detj_tol, - jinvxx, jinvxy, jinvxz, - jinvyx, jinvyy, jinvyz, - jinvzx, jinvzy, jinvzz, - detj_unfixed, detj, abs_detj, invdetj); - - const rajaperf::Real_type detjwgts = wgts*abs_detj; - - rajaperf::Real_type ebasisx[EB] = {0}; - edgebasis_x(ebasisx, tmpyz, yloctmpz, tmpyzloc, yzloc); - - rajaperf::Real_type ebasisy[EB] = {0}; - edgebasis_y(ebasisy, tmpxz, xloctmpz, tmpxzloc, xzloc); - - rajaperf::Real_type dbasisz[EB] = {0}; - curl_edgebasis_z(dbasisz, tmpz, zloc); - - const rajaperf::Real_type inv_abs_detj = 1./(abs_detj+ptiny); - - rajaperf::Real_type tebasisx[EB] = {0}; - rajaperf::Real_type tebasisy[EB] = {0}; - rajaperf::Real_type tebasisz[EB] = {0}; - - transform_edge_basis( - jinvxx, jinvxy, jinvxz, - jinvyx, jinvyy, jinvyz, - jinvzx, jinvzy, jinvzz, - ebasisx, ebasisy, ebasisz, - tebasisx, tebasisy, tebasisz); - - rajaperf::Real_type tdbasisx[EB] = {0}; - 
rajaperf::Real_type tdbasisy[EB] = {0}; - rajaperf::Real_type tdbasisz[EB] = {0}; - - transform_curl_edge_basis( - jxx, jxy, jxz, - jyx, jyy, jyz, - jzx, jzy, jzz, - inv_abs_detj, - dbasisx, dbasisy, dbasisz, - tdbasisx, tdbasisy, tdbasisz); - - // the inner product: alpha* - inner_product( - detjwgts*alpha, - tebasisx, tebasisy, tebasisz, - tebasisx, tebasisy, tebasisz, - matrix, true); - - // the inner product: beta* - inner_product( - detjwgts*beta, - tdbasisx, tdbasisy, tdbasisz, - tdbasisx, tdbasisy, tdbasisz, - matrix, true); - } - } - } -} - #define EDGE3D_DATA_SETUP \ Real_ptr x = m_x; \ Real_ptr y = m_y; \ @@ -376,15 +210,18 @@ RAJA_INLINE void edge_MpSmatrix( NDPTRSET(m_domain->jp, m_domain->kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ; #define EDGE3D_BODY \ - rajaperf::Real_type X[NB] = {x0[i],x1[i],x2[i],x3[i],x4[i],x5[i],x6[i],x7[i]};\ - rajaperf::Real_type Y[NB] = {y0[i],y1[i],y2[i],y3[i],y4[i],y5[i],y6[i],y7[i]};\ - rajaperf::Real_type Z[NB] = {z0[i],z1[i],z2[i],z3[i],z4[i],z5[i],z6[i],z7[i]};\ - rajaperf::Real_type edge_matrix[EB][EB];\ + Real_array X;\ + Real_array Y;\ + Real_array Z;\ + Real_array2 edge_matrix;\ + X[0]=x0[i]; X[1]=x1[i]; X[2]=x2[i]; X[3]=x3[i]; X[4]=x4[i]; X[5]=x5[i]; X[6]=x6[i]; X[7]=x7[i];\ + Y[0]=y0[i]; Y[1]=y1[i]; Y[2]=y2[i]; Y[3]=y3[i]; Y[4]=y4[i]; Y[5]=y5[i]; Y[6]=y6[i]; Y[7]=y7[i];\ + Z[0]=z0[i]; Z[1]=z1[i]; Z[2]=z2[i]; Z[3]=z3[i]; Z[4]=z4[i]; Z[5]=z5[i]; Z[6]=z6[i]; Z[7]=z7[i];\ edge_MpSmatrix(X, Y, Z, 1.0, 1.0, 0.0, 1.0, NQ_1D, edge_matrix);\ - rajaperf::Real_type local_sum = 0.0;\ - for (rajaperf::Int_type m = 0; m < EB; m++) {\ - rajaperf::Real_type check = 0.0;\ - for (rajaperf::Int_type p = 0; p < EB; p++) {\ + Real_type local_sum = 0.0;\ + for (Int_type m = 0; m < EB; m++) {\ + Real_type check = 0.0;\ + for (Int_type p = 0; p < EB; p++) {\ check += edge_matrix[m][p];\ }\ local_sum += check;\ @@ -413,6 +250,7 @@ class EDGE3D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t 
tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/apps/EDGE3D_COUNT.cpp b/src/apps/EDGE3D_COUNT.cpp new file mode 100644 index 000000000..91f5e8906 --- /dev/null +++ b/src/apps/EDGE3D_COUNT.cpp @@ -0,0 +1,69 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) Lawrence Livermore National Security, LLC and other +// RAJA Project Developers. See top-level LICENSE and COPYRIGHT +// files for dates and other details. No copyright assignment is required +// to contribute to RAJA Performance Suite. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "EDGE3D.hpp" + +#include "RAJA/RAJA.hpp" + +#include "AppsData.hpp" +#include "common/DataUtils.hpp" + +#include + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +// This shouldn't result in ODR violations as the argument types have changed +#include "EDGE3D_HELPER.hpp" + +namespace rajaperf +{ +namespace apps +{ + +void EDGE3D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = m_domain->fpz; + const Index_type iend = m_domain->lpz+1; + + EDGE3D_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(EDGE3D_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + +} // end namespace apps +} // end namespace rajaperf diff --git 
a/src/apps/EDGE3D_HELPER.hpp b/src/apps/EDGE3D_HELPER.hpp new file mode 100644 index 000000000..89dea5ccf --- /dev/null +++ b/src/apps/EDGE3D_HELPER.hpp @@ -0,0 +1,188 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) Lawrence Livermore National Security, LLC and other +// RAJA Project Developers. See top-level LICENSE and COPYRIGHT +// files for dates and other details. No copyright assignment is required +// to contribute to RAJA Performance Suite. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJAPerf_Apps_EDGE3D_HELPER_HPP +#define RAJAPerf_Apps_EDGE3D_HELPER_HPP + +#include "mixed_fem_helper.hpp" + +namespace rajaperf +{ + +RAJA_HOST_DEVICE +RAJA_INLINE void edge_MpSmatrix( + Real_array_const_ref x, + Real_array_const_ref y, + Real_array_const_ref z, + Real_type alpha, + Real_type beta, + const Real_type detj_tol, + const Int_type quad_type, + const Int_type quad_order, + Real_array2_ref matrix) +{ + // Get integration points and weights + Real_array qpts_1d; + Real_array wgts_1d; + + get_quadrature_rule(quad_type, quad_order, qpts_1d, wgts_1d); + + // Compute cell centered Jacobian + const Real_type jxx_cc = Jxx(x, y, z, 0.25, 0.25, 0.25, 0.25); + const Real_type jxy_cc = Jxy(x, y, z, 0.25, 0.25, 0.25, 0.25); + const Real_type jxz_cc = Jxz(x, y, z, 0.25, 0.25, 0.25, 0.25); + const Real_type jyx_cc = Jyx(x, y, z, 0.25, 0.25, 0.25, 0.25); + const Real_type jyy_cc = Jyy(x, y, z, 0.25, 0.25, 0.25, 0.25); + const Real_type jyz_cc = Jyz(x, y, z, 0.25, 0.25, 0.25, 0.25); + const Real_type jzx_cc = Jzx(x, y, z, 0.25, 0.25, 0.25, 0.25); + const Real_type jzy_cc = Jzy(x, y, z, 0.25, 0.25, 0.25, 0.25); + const Real_type jzz_cc = Jzz(x, y, z, 0.25, 0.25, 0.25, 0.25); + + // Compute cell centered Jacobian determinant + const Real_type detj_cc = compute_detj( + jxx_cc, jxy_cc, jxz_cc, + jyx_cc, jyy_cc, jyz_cc, + jzx_cc, jzy_cc, 
jzz_cc); + + // Initialize the stiffness matrix + for (Int_type m = 0; m < EB; m++) { + for (Int_type p = m; p < EB; p++) { + matrix[m][p] = 0.0; + } + } + + // Compute values at each quadrature point + for ( Int_type i = 0; i < quad_order; i++ ) { + + const Real_type xloc = qpts_1d[i]; + const Real_type tmpx = 1. - xloc; + + Real_array dbasisx = {}; + curl_edgebasis_x(dbasisx, tmpx, xloc); + + for ( Int_type j = 0; j < quad_order; j++ ) { + + const Real_type yloc = qpts_1d[j]; + const Real_type wgtxy = wgts_1d[i]*wgts_1d[j]; + const Real_type tmpy = 1. - yloc; + + Real_type tmpxy = tmpx*tmpy; + Real_type xyloc = xloc*yloc; + Real_type tmpxyloc = tmpx*yloc; + Real_type xloctmpy = xloc*tmpy; + + const Real_type jzx = Jzx(x, y, z, tmpxy, xloctmpy, xyloc, tmpxyloc); + const Real_type jzy = Jzy(x, y, z, tmpxy, xloctmpy, xyloc, tmpxyloc); + const Real_type jzz = Jzz(x, y, z, tmpxy, xloctmpy, xyloc, tmpxyloc); + + Real_array ebasisz = {}; + edgebasis_z(ebasisz, tmpxy, xloctmpy, xyloc, tmpxyloc); + + Real_array dbasisy = {}; + curl_edgebasis_y(dbasisy, tmpy, yloc); + + // Differentiate basis with respect to z at this quadrature point + + for ( Int_type k = 0; k < quad_order; k++ ) { + + const Real_type zloc = qpts_1d[k]; + const Real_type wgts = wgtxy*wgts_1d[k]; + const Real_type tmpz = 1. 
- zloc; + + const Real_type tmpxz = tmpx*tmpz; + const Real_type tmpyz = tmpy*tmpz; + + const Real_type xzloc = xloc*zloc; + const Real_type yzloc = yloc*zloc; + + const Real_type tmpyzloc = tmpy*zloc; + const Real_type tmpxzloc = tmpx*zloc; + + const Real_type yloctmpz = yloc*tmpz; + const Real_type xloctmpz = xloc*tmpz; + + const Real_type jxx = Jxx(x, y, z, tmpyz, yloctmpz, tmpyzloc, yzloc); + const Real_type jxy = Jxy(x, y, z, tmpyz, yloctmpz, tmpyzloc, yzloc); + const Real_type jxz = Jxz(x, y, z, tmpyz, yloctmpz, tmpyzloc, yzloc); + const Real_type jyx = Jyx(x, y, z, tmpxz, xloctmpz, tmpxzloc, xzloc); + const Real_type jyy = Jyy(x, y, z, tmpxz, xloctmpz, tmpxzloc, xzloc); + const Real_type jyz = Jyz(x, y, z, tmpxz, xloctmpz, tmpxzloc, xzloc); + + Real_type jinvxx, jinvxy, jinvxz, + jinvyx, jinvyy, jinvyz, + jinvzx, jinvzy, jinvzz, + detj_unfixed, detj, abs_detj, invdetj; + + jacobian_inv( + jxx, jxy, jxz, + jyx, jyy, jyz, + jzx, jzy, jzz, + detj_cc, detj_tol, + jinvxx, jinvxy, jinvxz, + jinvyx, jinvyy, jinvyz, + jinvzx, jinvzy, jinvzz, + detj_unfixed, detj, abs_detj, invdetj); + + const Real_type detjwgts = wgts*abs_detj; + + Real_array ebasisx = {}; + edgebasis_x(ebasisx, tmpyz, yloctmpz, tmpyzloc, yzloc); + + Real_array ebasisy = {}; + edgebasis_y(ebasisy, tmpxz, xloctmpz, tmpxzloc, xzloc); + + Real_array dbasisz = {}; + curl_edgebasis_z(dbasisz, tmpz, zloc); + + const Real_type inv_abs_detj = 1./(abs_detj+ptiny); + + Real_array tebasisx = {}; + Real_array tebasisy = {}; + Real_array tebasisz = {}; + + transform_edge_basis( + jinvxx, jinvxy, jinvxz, + jinvyx, jinvyy, jinvyz, + jinvzx, jinvzy, jinvzz, + ebasisx, ebasisy, ebasisz, + tebasisx, tebasisy, tebasisz); + + Real_array tdbasisx = {}; + Real_array tdbasisy = {}; + Real_array tdbasisz = {}; + + transform_curl_edge_basis( + jxx, jxy, jxz, + jyx, jyy, jyz, + jzx, jzy, jzz, + inv_abs_detj, + dbasisx, dbasisy, dbasisz, + tdbasisx, tdbasisy, tdbasisz); + + // the inner product: alpha* + inner_product( + 
detjwgts*alpha, + Real_array_const_ref{tebasisx}, Real_array_const_ref{tebasisy}, Real_array_const_ref{tebasisz}, + Real_array_const_ref{tebasisx}, Real_array_const_ref{tebasisy}, Real_array_const_ref{tebasisz}, + matrix, true); + + // the inner product: beta* + inner_product( + detjwgts*beta, + Real_array_const_ref{tdbasisx}, Real_array_const_ref{tdbasisy}, Real_array_const_ref{tdbasisz}, + Real_array_const_ref{tdbasisx}, Real_array_const_ref{tdbasisy}, Real_array_const_ref{tdbasisz}, + matrix, true); + } + } + } +} + +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp index 49fb922c8..13d154487 100644 --- a/src/apps/ENERGY.cpp +++ b/src/apps/ENERGY.cpp @@ -139,5 +139,67 @@ void ENERGY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_vnewc, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void ENERGY::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + ENERGY_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ENERGY_BODY1); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ENERGY_BODY2); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ENERGY_BODY3); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ENERGY_BODY4); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for 
(Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ENERGY_BODY5); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ENERGY_BODY6); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/ENERGY.hpp b/src/apps/ENERGY.hpp index 57762d9c6..5ff594c16 100644 --- a/src/apps/ENERGY.hpp +++ b/src/apps/ENERGY.hpp @@ -199,6 +199,7 @@ class ENERGY : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp index f9366e56e..2f5080f3b 100644 --- a/src/apps/FIR.cpp +++ b/src/apps/FIR.cpp @@ -83,5 +83,52 @@ void FIR::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_out, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void FIR::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + FIR_COEFF; + + FIR_DATA_SETUP; + + Real_type coeff[FIR_COEFFLEN]; + std::copy(std::begin(coeff_array), std::end(coeff_array), std::begin(coeff)); + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(FIR_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + 
RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/FIR.hpp b/src/apps/FIR.hpp index 18e817113..97e22d0d2 100644 --- a/src/apps/FIR.hpp +++ b/src/apps/FIR.hpp @@ -74,6 +74,7 @@ class FIR : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp index 416674bb7..0d667f371 100644 --- a/src/apps/LTIMES_NOVIEW.cpp +++ b/src/apps/LTIMES_NOVIEW.cpp @@ -98,5 +98,50 @@ void LTIMES_NOVIEW::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_psidat, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void LTIMES_NOVIEW::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + LTIMES_NOVIEW_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type z = 0; z < num_z; ++z )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type g = 0; g < num_g; ++g )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type m = 0; m < num_m; ++m )) { + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type d = 0; d < num_d; ++d )) { + RAJAPERF_COUNTERS_LOOP_BODY(LTIMES_NOVIEW_BODY); + } + } + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp index 99b75c9de..af2d4bcb1 100644 --- a/src/apps/LTIMES_NOVIEW.hpp +++ 
b/src/apps/LTIMES_NOVIEW.hpp @@ -64,6 +64,7 @@ class LTIMES_NOVIEW : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/apps/MASS3DEA.cpp b/src/apps/MASS3DEA.cpp index 097b87bed..99eb6aeaf 100644 --- a/src/apps/MASS3DEA.cpp +++ b/src/apps/MASS3DEA.cpp @@ -95,5 +95,73 @@ void MASS3DEA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_M, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MASS3DEA::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + MASS3DEA_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type e = 0; e < NE; ++e)) { + RAJAPERF_COUNTERS_TEAM_CONTEXT(); + + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DEA_0_CPU); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(d, x, mea::D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(q, y, mea::Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DEA_1); + } + } + + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DEA_2_CPU); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(k1, x, mea::Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(k2, y, mea::Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(k3, z, mea::Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DEA_3); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(i1, x, mea::D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(i2, y, mea::D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(i3, z, mea::D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DEA_4); + } + } + } + + } // 
element loop + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/MASS3DEA.hpp b/src/apps/MASS3DEA.hpp index dc1d73e1e..06717d7d6 100644 --- a/src/apps/MASS3DEA.hpp +++ b/src/apps/MASS3DEA.hpp @@ -106,15 +106,17 @@ constexpr RAJA::Index_type Q1D = 5; D[qx + mea::Q1D * qy + mea::Q1D * mea::Q1D * qz + \ mea::Q1D * mea::Q1D * mea::Q1D * e] -#define MASS3DEA_0 RAJA_TEAM_SHARED Real_type s_B[mea::Q1D][mea::D1D]; +#define MASS3DEA_0 RAJA_TEAM_SHARED Real_array2 s_B; -#define MASS3DEA_0_CPU Real_type s_B[mea::Q1D][mea::D1D]; +#define MASS3DEA_0_CPU Real_array2 s_B; #define MASS3DEA_1 s_B[q][d] = MEA_B(q, d); -#define MASS3DEA_2 RAJA_TEAM_SHARED Real_type s_D[mea::Q1D][mea::Q1D][mea::Q1D]; +#define MASS3DEA_2 \ + RAJA_TEAM_SHARED Real_array3 s_D; -#define MASS3DEA_2_CPU Real_type s_D[mea::Q1D][mea::Q1D][mea::Q1D]; +#define MASS3DEA_2_CPU \ + Real_array3 s_D; #define MASS3DEA_3 s_D[k1][k2][k3] = MEA_D(k1, k2, k3, e); @@ -153,6 +155,7 @@ class MASS3DEA : public KernelBase { void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp index 4d24234f3..da84c5e32 100644 --- a/src/apps/MASS3DPA.cpp +++ b/src/apps/MASS3DPA.cpp @@ -100,5 +100,112 @@ void MASS3DPA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_Y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MASS3DPA::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + 
setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + MASS3DPA_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type e = 0; e < NE; ++e)) { + RAJAPERF_COUNTERS_TEAM_CONTEXT(); + + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_0_CPU); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, mpa::D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, mpa::D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_1); + } + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, mpa::Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_2); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, mpa::D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx, x, mpa::Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_3); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy, y, mpa::Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx, x, mpa::Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_4); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy, y, mpa::Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx, x, mpa::Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_5); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(d, y, mpa::D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(q, x, mpa::Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_6); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy, y, mpa::Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, mpa::D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_7); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, mpa::D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, mpa::D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_8); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, mpa::D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, mpa::D1D)) { + 
RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_9); + } + } + + } // element loop + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/MASS3DPA.hpp b/src/apps/MASS3DPA.hpp index b0916e1d5..6ae3e2d3c 100644 --- a/src/apps/MASS3DPA.hpp +++ b/src/apps/MASS3DPA.hpp @@ -188,36 +188,36 @@ constexpr RAJA::Index_type Q1D = 5; mpa::Q1D * mpa::Q1D * mpa::Q1D * e] #define MASS3DPA_0_CPU \ - constexpr Index_type MQ1 = mpa::Q1D; \ - constexpr Index_type MD1 = mpa::D1D; \ - constexpr Index_type MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ - Real_type sDQ[MQ1 * MD1]; \ - Real_type(*Bsmem)[MD1] = (Real_type(*)[MD1])sDQ; \ - Real_type(*Btsmem)[MQ1] = (Real_type(*)[MQ1])sDQ; \ - Real_type sm0[MDQ * MDQ * MDQ]; \ - Real_type sm1[MDQ * MDQ * MDQ]; \ - Real_type(*Xsmem)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm0; \ - Real_type(*DDQ)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])sm1; \ - Real_type(*DQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm0; \ - Real_type(*QQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm1; \ - Real_type(*QQD)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])sm0; \ - Real_type(*QDD)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm1; + constexpr auto MQ1 = mpa::Q1D; \ + constexpr auto MD1 = mpa::D1D; \ + constexpr auto MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ + Real_array2 sDQ; \ + Real_array2_ref Bsmem(sDQ); \ + Real_array2_ref Btsmem(sDQ); \ + Real_array3 sm0; \ + Real_array3 sm1; \ + Real_array3_ref Xsmem(sm0); \ + Real_array3_ref DDQ(sm1); \ + Real_array3_ref DQQ(sm0); \ + Real_array3_ref QQQ(sm1); \ + Real_array3_ref QQD(sm0); \ + Real_array3_ref QDD(sm1); #define MASS3DPA_0_GPU \ - constexpr Index_type MQ1 = mpa::Q1D; \ - constexpr Index_type MD1 = mpa::D1D; \ - constexpr Index_type MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ - RAJA_TEAM_SHARED Real_type sDQ[MQ1 * MD1]; \ - Real_type(*Bsmem)[MD1] = (Real_type(*)[MD1])sDQ; \ - Real_type(*Btsmem)[MQ1] = (Real_type(*)[MQ1])sDQ; \ - RAJA_TEAM_SHARED Real_type sm0[MDQ * MDQ * MDQ]; \ - RAJA_TEAM_SHARED Real_type sm1[MDQ * MDQ * MDQ]; \ - Real_type(*Xsmem)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm0; \ - Real_type(*DDQ)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])sm1; \ - Real_type(*DQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm0; \ - Real_type(*QQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm1; \ - Real_type(*QQD)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])sm0; \ - Real_type(*QDD)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm1; + constexpr auto MQ1 = mpa::Q1D; \ + constexpr auto MD1 = mpa::D1D; \ + constexpr auto MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ + RAJA_TEAM_SHARED Real_array2 sDQ; \ + Real_array2_ref Bsmem(sDQ); \ + Real_array2_ref Btsmem(sDQ); \ + RAJA_TEAM_SHARED Real_array3 sm0; \ + RAJA_TEAM_SHARED Real_array3 sm1; \ + Real_array3_ref Xsmem(sm0); \ + Real_array3_ref DDQ(sm1); \ + Real_array3_ref DQQ(sm0); \ + Real_array3_ref QQQ(sm1); \ + Real_array3_ref QQD(sm0); \ + Real_array3_ref QDD(sm1); #define MASS3DPA_1 \ RAJAPERF_UNROLL(MD1) \ @@ -360,6 +360,7 @@ class MASS3DPA : public KernelBase { void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/apps/MASS3DPA_ATOMIC.hpp b/src/apps/MASS3DPA_ATOMIC.hpp index a42f4914f..352bf508f 100644 --- a/src/apps/MASS3DPA_ATOMIC.hpp +++ b/src/apps/MASS3DPA_ATOMIC.hpp @@ -162,35 +162,35 @@ constexpr RAJA::Index_type Q1D = 4; mpa_at::Q1D * mpa_at::Q1D * mpa_at::Q1D * e] #define MASS3DPA_ATOMIC_0_CPU \ - constexpr Index_type MQ1 = mpa_at::Q1D; \ - constexpr Index_type MD1 = mpa_at::D1D; \ - constexpr Index_type MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ - Real_type sm_B[MQ1][MD1]; \ - Real_type sm_Bt[MD1][MQ1]; \ - Real_type sm0[MDQ * MDQ * MDQ]; \ - Real_type sm1[MDQ * MDQ * MDQ]; \ - Real_type(*sm_X)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm0; \ - Real_type(*DDQ)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])sm1; \ - Real_type(*DQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm0; \ - Real_type(*QQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm1; \ - Real_type(*QQD)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])sm0; \ - Real_type(*QDD)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm1; \ + constexpr auto MQ1 = mpa_at::Q1D; \ + constexpr auto MD1 = mpa_at::D1D; \ + constexpr auto MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ + Real_array2 sm_B; \ + Real_array2 sm_Bt; \ + Real_array3 sm0; \ + Real_array3 sm1; \ + Real_array3_ref sm_X(sm0); \ + Real_array3_ref DDQ(sm1); \ + Real_array3_ref DQQ(sm0); \ + Real_array3_ref QQQ(sm1); \ + Real_array3_ref QQD(sm0); \ + Real_array3_ref QDD(sm1); \ Index_type thread_dofs[MD1 * MD1 * MD1]; #define MASS3DPA_ATOMIC_0_GPU \ - constexpr Index_type MQ1 = mpa_at::Q1D; \ - constexpr Index_type MD1 = mpa_at::D1D; \ - constexpr Index_type MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ - RAJA_TEAM_SHARED Real_type sm_B[MQ1][MD1]; \ - RAJA_TEAM_SHARED Real_type sm_Bt[MD1][MQ1]; \ - RAJA_TEAM_SHARED Real_type sm0[MDQ * MDQ * MDQ]; \ - RAJA_TEAM_SHARED Real_type sm1[MDQ * MDQ * MDQ]; \ - Real_type(*sm_X)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm0; \ - Real_type(*DDQ)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])sm1; \ - Real_type(*DQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm0; \ - Real_type(*QQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm1; \ - Real_type(*QQD)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])sm0; \ - Real_type(*QDD)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm1; \ + constexpr auto MQ1 = mpa_at::Q1D; \ + constexpr auto MD1 = mpa_at::D1D; \ + constexpr auto MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ + RAJA_TEAM_SHARED Real_array2 sm_B; \ + RAJA_TEAM_SHARED Real_array2 sm_Bt; \ + RAJA_TEAM_SHARED Real_array3 sm0; \ + RAJA_TEAM_SHARED Real_array3 sm1; \ + Real_array3_ref sm_X(sm0); \ + Real_array3_ref DDQ(sm1); \ + Real_array3_ref DQQ(sm0); \ + Real_array3_ref QQQ(sm1); \ + Real_array3_ref QQD(sm0); \ + Real_array3_ref QDD(sm1); \ RAJA_TEAM_SHARED Index_type thread_dofs[MD1 * MD1 * MD1]; #define MASS3DPA_ATOMIC_1 \ diff --git a/src/apps/MASSVEC3DPA.hpp b/src/apps/MASSVEC3DPA.hpp index af0745234..719a1d909 100644 --- a/src/apps/MASSVEC3DPA.hpp +++ b/src/apps/MASSVEC3DPA.hpp @@ -159,34 +159,34 @@ constexpr RAJA::Index_type DIM = 3; mvpa::Q1D * mvpa::Q1D * mvpa::Q1D * e] #define MASSVEC3DPA_0_CPU \ - constexpr Index_type MQ1 = mvpa::Q1D; \ - constexpr Index_type MD1 = mvpa::D1D; \ - constexpr Index_type MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ - /*RAJA_TEAM_SHARED*/ Real_type smB[MQ1][MD1]; \ - /*RAJA_TEAM_SHARED*/ Real_type smBt[MD1][MQ1]; \ - /*RAJA_TEAM_SHARED*/ Real_type sm0[MDQ * MDQ * MDQ]; \ - /*RAJA_TEAM_SHARED*/ Real_type sm1[MDQ * MDQ * MDQ]; \ - Real_type(*smX)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm0; \ - Real_type(*DDQ)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])sm1; \ - Real_type(*DQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm0; \ - Real_type(*QQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm1; \ - Real_type(*QQD)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])sm0; \ - Real_type(*QDD)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm1; + constexpr auto MQ1 = mvpa::Q1D; \ + constexpr auto MD1 = mvpa::D1D; \ + constexpr auto MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ + /*RAJA_TEAM_SHARED*/ Real_array2 smB; \ + /*RAJA_TEAM_SHARED*/ Real_array2 smBt; \ + /*RAJA_TEAM_SHARED*/ Real_array3 sm0; \ + /*RAJA_TEAM_SHARED*/ Real_array3 sm1; \ + Real_array3_ref smX(sm0); \ + Real_array3_ref DDQ(sm1); \ + Real_array3_ref DQQ(sm0); \ + Real_array3_ref QQQ(sm1); \ + Real_array3_ref QQD(sm0); \ + Real_array3_ref QDD(sm1); #define MASSVEC3DPA_0_GPU \ - constexpr Index_type MQ1 = mvpa::Q1D; \ - constexpr Index_type MD1 = mvpa::D1D; \ - constexpr Index_type MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ - RAJA_TEAM_SHARED Real_type smB[MQ1][MD1]; \ - RAJA_TEAM_SHARED Real_type smBt[MD1][MQ1]; \ - RAJA_TEAM_SHARED Real_type sm0[MDQ * MDQ * MDQ]; \ - RAJA_TEAM_SHARED Real_type sm1[MDQ * MDQ * MDQ]; \ - Real_type(*smX)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm0; \ - Real_type(*DDQ)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])sm1; \ - Real_type(*DQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm0; \ - Real_type(*QQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm1; \ - Real_type(*QQD)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])sm0; \ - Real_type(*QDD)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm1; + constexpr auto MQ1 = mvpa::Q1D; \ + constexpr auto MD1 = mvpa::D1D; \ + constexpr auto MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ + RAJA_TEAM_SHARED Real_array2 smB; \ + RAJA_TEAM_SHARED Real_array2 smBt; \ + RAJA_TEAM_SHARED Real_array3 sm0; \ + RAJA_TEAM_SHARED Real_array3 sm1; \ + Real_array3_ref smX(sm0); \ + Real_array3_ref DDQ(sm1); \ + Real_array3_ref DQQ(sm0); \ + Real_array3_ref QQQ(sm1); \ + Real_array3_ref QQD(sm0); \ + Real_array3_ref QDD(sm1); #define MASSVEC3DPA_1 \ Real_type r_smB = MVPA_B(q, d); \ diff --git a/src/apps/MATVEC_3D_STENCIL.cpp b/src/apps/MATVEC_3D_STENCIL.cpp index 8ef901d9c..08079590f 100644 --- a/src/apps/MATVEC_3D_STENCIL.cpp +++ b/src/apps/MATVEC_3D_STENCIL.cpp @@ -177,5 +177,48 @@ void MATVEC_3D_STENCIL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ deallocData(m_real_zones, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MATVEC_3D_STENCIL::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + MATVEC_3D_STENCIL_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ii = ibegin ; ii < iend ; ++ii )) { + RAJAPERF_COUNTERS_LOOP_BODY(MATVEC_3D_STENCIL_BODY_INDEX); + RAJAPERF_COUNTERS_LOOP_BODY(MATVEC_3D_STENCIL_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/MATVEC_3D_STENCIL.hpp b/src/apps/MATVEC_3D_STENCIL.hpp index 485688785..7556a7bdc 100644 --- a/src/apps/MATVEC_3D_STENCIL.hpp +++ b/src/apps/MATVEC_3D_STENCIL.hpp @@ -133,6 +133,7 @@ class MATVEC_3D_STENCIL : public KernelBase void setUp(VariantID vid, size_t tune_idx); void 
updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/apps/NODAL_ACCUMULATION_3D.cpp b/src/apps/NODAL_ACCUMULATION_3D.cpp index 874069cf5..1de1a493e 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D.cpp @@ -99,5 +99,48 @@ void NODAL_ACCUMULATION_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t deallocData(m_real_zones, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void NODAL_ACCUMULATION_3D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + NODAL_ACCUMULATION_3D_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ii = ibegin ; ii < iend ; ++ii )) { + RAJAPERF_COUNTERS_LOOP_BODY(NODAL_ACCUMULATION_3D_BODY_INDEX); + RAJAPERF_COUNTERS_LOOP_BODY(NODAL_ACCUMULATION_3D_BODY(RAJAPERF_ATOMIC_ADD_COUNTING)); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, 0); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/NODAL_ACCUMULATION_3D.hpp b/src/apps/NODAL_ACCUMULATION_3D.hpp index cb1a646ff..0d766cf5f 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.hpp +++ b/src/apps/NODAL_ACCUMULATION_3D.hpp @@ -81,6 +81,7 @@ class NODAL_ACCUMULATION_3D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void 
defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp index 0a56491b4..4104d3cf8 100644 --- a/src/apps/PRESSURE.cpp +++ b/src/apps/PRESSURE.cpp @@ -93,5 +93,51 @@ void PRESSURE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_vnewc, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void PRESSURE::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + PRESSURE_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(PRESSURE_BODY1); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(PRESSURE_OPT_BODY2); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/PRESSURE.hpp b/src/apps/PRESSURE.hpp index ccf817fe8..a5125cde3 100644 --- a/src/apps/PRESSURE.hpp +++ b/src/apps/PRESSURE.hpp @@ -45,6 +45,13 @@ if ( vnewc[i] >= eosvmax ) p_new[i] = 0.0 ; \ if ( p_new[i] < pmin ) p_new[i] = pmin ; +#define PRESSURE_OPT_BODY2 \ + Real_type p = bvc[i] * e_old[i] ; \ + if ( fabs(p) < p_cut ) p = 0.0 ; \ + if ( vnewc[i] >= eosvmax ) p = 0.0 ; \ + if ( p < pmin ) p = pmin ; \ + p_new[i] = p; + #include "common/KernelBase.hpp" @@ -67,6 +74,7 @@ class PRESSURE : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void 
tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index c9dda5682..96355cff5 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -100,5 +100,47 @@ void VOL3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_vol, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void VOL3D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = m_domain->fpz; + const Index_type iend = m_domain->lpz+1; + + VOL3D_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin ; i < iend ; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(VOL3D_OPT_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/VOL3D.hpp b/src/apps/VOL3D.hpp index 1fa37e43a..16a5af09f 100644 --- a/src/apps/VOL3D.hpp +++ b/src/apps/VOL3D.hpp @@ -137,6 +137,59 @@ \ vol[i] *= vnormq ; +#define VOL3D_OPT_BODY \ + Real_type x71 = x7[i] - x1[i] ; \ + Real_type x72 = x7[i] - x2[i] ; \ + Real_type x74 = x7[i] - x4[i] ; \ + Real_type x30 = x3[i] - x0[i] ; \ + Real_type x50 = x5[i] - x0[i] ; \ + Real_type x60 = x6[i] - x0[i] ; \ + \ + Real_type y71 = y7[i] - y1[i] ; \ + Real_type y72 = y7[i] - y2[i] ; \ + Real_type y74 = y7[i] - y4[i] ; \ + Real_type y30 = y3[i] - y0[i] ; \ + Real_type y50 = y5[i] - y0[i] ; \ + Real_type y60 = y6[i] - y0[i] ; \ + \ + Real_type z71 = z7[i] - z1[i] ; \ + Real_type z72 = z7[i] - z2[i] ; \ 
+ Real_type z74 = z7[i] - z4[i] ; \ + Real_type z30 = z3[i] - z0[i] ; \ + Real_type z50 = z5[i] - z0[i] ; \ + Real_type z60 = z6[i] - z0[i] ; \ + \ + Real_type xps = x71 + x60 ; \ + Real_type yps = y71 + y60 ; \ + Real_type zps = z71 + z60 ; \ + \ + Real_type cyz = y72 * z30 - z72 * y30 ; \ + Real_type czx = z72 * x30 - x72 * z30 ; \ + Real_type cxy = x72 * y30 - y72 * x30 ; \ + Real_type v = xps * cyz + yps * czx + zps * cxy ; \ + \ + xps = x72 + x50 ; \ + yps = y72 + y50 ; \ + zps = z72 + z50 ; \ + \ + cyz = y74 * z60 - z74 * y60 ; \ + czx = z74 * x60 - x74 * z60 ; \ + cxy = x74 * y60 - y74 * x60 ; \ + v += xps * cyz + yps * czx + zps * cxy ; \ + \ + xps = x74 + x30 ; \ + yps = y74 + y30 ; \ + zps = z74 + z30 ; \ + \ + cyz = y71 * z50 - z71 * y50 ; \ + czx = z71 * x50 - x71 * z50 ; \ + cxy = x71 * y50 - y71 * x50 ; \ + v += xps * cyz + yps * czx + zps * cxy ; \ + \ + v *= vnormq ; \ + \ + vol[i] = v ; + #include "common/KernelBase.hpp" @@ -160,6 +213,7 @@ class VOL3D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/apps/ZONAL_ACCUMULATION_3D.cpp b/src/apps/ZONAL_ACCUMULATION_3D.cpp index 9b94d1e4d..61f0a0724 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D.cpp @@ -98,5 +98,48 @@ void ZONAL_ACCUMULATION_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t deallocData(m_real_zones, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void ZONAL_ACCUMULATION_3D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + 
RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + ZONAL_ACCUMULATION_3D_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ii = ibegin ; ii < iend ; ++ii )) { + RAJAPERF_COUNTERS_LOOP_BODY(ZONAL_ACCUMULATION_3D_BODY_INDEX); + RAJAPERF_COUNTERS_LOOP_BODY(ZONAL_ACCUMULATION_3D_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/ZONAL_ACCUMULATION_3D.hpp b/src/apps/ZONAL_ACCUMULATION_3D.hpp index 3a1e1ca5f..747c45ccd 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D.hpp +++ b/src/apps/ZONAL_ACCUMULATION_3D.hpp @@ -77,6 +77,7 @@ class ZONAL_ACCUMULATION_3D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/apps/mixed_fem_helper.hpp b/src/apps/mixed_fem_helper.hpp index e13717332..88ef0289f 100644 --- a/src/apps/mixed_fem_helper.hpp +++ b/src/apps/mixed_fem_helper.hpp @@ -7,21 +7,13 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#ifndef MIXED_FEM_HELPER -#define MIXED_FEM_HELPER +#ifndef RAJAPerf_Apps_MIXED_FEM_HELPER +#define RAJAPerf_Apps_MIXED_FEM_HELPER -#include "RAJA/RAJA.hpp" - -#include "common/RPTypes.hpp" - -#include - -#define NB 8 -#define EB 12 -#define FB 6 -#define MAX_QUAD_ORDER 5 +namespace rajaperf +{ -constexpr rajaperf::Real_type ptiny = 1.0e-50; +constexpr Real_type_t ptiny = 1.0e-50; // // Common FEM functions @@ -29,15 +21,14 @@ constexpr rajaperf::Real_type ptiny = 1.0e-50; RAJA_HOST_DEVICE RAJA_INLINE void LinAlg_qrule_Lobatto( - rajaperf::Int_type order, - rajaperf::Real_type 
*qpts1D, - rajaperf::Real_type *wgts1D) + Int_type order, + Real_ptr qpts1D, + Real_ptr wgts1D) { // Define the Gauss-Lobatto quadrature points and weights over the // 1D domain [0,1] for rules up to order 5 - switch( order ) { - case 1 : + if (order == 1) { // Order 1 Gauss-Lobatto Points qpts1D[0] = 0.5; @@ -45,9 +36,7 @@ RAJA_INLINE void LinAlg_qrule_Lobatto( // Order 1 Gauss-Lobatto Weights wgts1D[0] = 1.0; - break; - - case 2 : + } else if (order == 2) { // Order 2 Gauss-Lobatto Points qpts1D[0] = 0.0; @@ -57,9 +46,7 @@ RAJA_INLINE void LinAlg_qrule_Lobatto( wgts1D[0] = 0.5; wgts1D[1] = 0.5; - break; - - case 3 : + } else if (order == 3) { // Order 3 Gauss-Lobatto Points qpts1D[0] = 0.0; @@ -71,9 +58,7 @@ RAJA_INLINE void LinAlg_qrule_Lobatto( wgts1D[1] = 0.666666666666666667; wgts1D[2] = 0.166666666666666667; - break; - - case 4 : + } else if (order == 4) { // Order 4 Gauss-Lobatto Points qpts1D[0] = 0.0; @@ -87,9 +72,7 @@ RAJA_INLINE void LinAlg_qrule_Lobatto( wgts1D[2] = 0.416666666666666667; wgts1D[3] = 0.0833333333333333333; - break; - - case 5 : + } else if (order == 5) { // Order 5 Gauss-Lobatto Points qpts1D[0] = 0.0; @@ -105,23 +88,19 @@ RAJA_INLINE void LinAlg_qrule_Lobatto( wgts1D[3] = 0.272222222222222222; wgts1D[4] = 0.05; - break; - } } RAJA_HOST_DEVICE RAJA_INLINE void LinAlg_qrule_Legendre( - rajaperf::Int_type order, - rajaperf::Real_type *qpts1D, - rajaperf::Real_type *wgts1D) + Int_type order, + Real_ptr qpts1D, + Real_ptr wgts1D) { // Define the Gauss-Legendre quadrature points and weights over the // 1D domain [0,1] for rules up to order 5 - switch( order ) { - - case 1 : + if (order == 1) { // Order 1 Gauss-Legendre Points qpts1D[0] = 0.5; @@ -129,9 +108,7 @@ RAJA_INLINE void LinAlg_qrule_Legendre( // Order 1 Gauss-Legendre Weights wgts1D[0] = 1.0; - break; - - case 2 : + } else if (order == 2) { // Order 2 Gauss-Legendre Points qpts1D[0] = 0.211324865405187118; @@ -141,9 +118,7 @@ RAJA_INLINE void LinAlg_qrule_Legendre( wgts1D[0] = 0.5; 
wgts1D[1] = 0.5; - break; - - case 3 : + } else if (order == 3) { // Order 3 Gauss-Legendre Points qpts1D[0] = 0.112701665379258311; @@ -155,9 +130,7 @@ RAJA_INLINE void LinAlg_qrule_Legendre( wgts1D[1] = 0.444444444444444444; wgts1D[2] = 0.277777777777777778; - break; - - case 4 : + } else if (order == 4) { // Order 4 Gauss-Legendre Points qpts1D[0] = 0.0694318442029737124; @@ -171,9 +144,7 @@ RAJA_INLINE void LinAlg_qrule_Legendre( wgts1D[2] = 0.326072577431273071; wgts1D[3] = 0.173927422568726929; - break; - - case 5 : + } else if (order == 5) { // Order 5 Gauss-Legendre Points qpts1D[0] = 0.0469100770306680036; @@ -189,76 +160,72 @@ RAJA_INLINE void LinAlg_qrule_Legendre( wgts1D[3] = 0.239314335249683234; wgts1D[4] = 0.118463442528094544; - break; - } } RAJA_HOST_DEVICE RAJA_INLINE void get_quadrature_rule( - const rajaperf::Int_type quad_type, - const rajaperf::Int_type quad_order, - rajaperf::Real_type (&qpts_1d)[MAX_QUAD_ORDER], - rajaperf::Real_type (&wgts_1d)[MAX_QUAD_ORDER]) + const Int_type quad_type, + const Int_type quad_order, + Real_array_ref qpts_1d, + Real_array_ref wgts_1d) { // Generate the 1D set of points and weights over the interval [0,1] - switch( quad_type ) { + if (quad_type == 0) { - case 0 : LinAlg_qrule_Lobatto(quad_order, qpts_1d, wgts_1d); - break; - case 1 : + } else if (quad_type == 1) { + LinAlg_qrule_Legendre(quad_order, qpts_1d, wgts_1d); - break; } } -constexpr rajaperf::Int_type flops_compute_detj() +inline Int_type flops_compute_detj() { return 17; } RAJA_HOST_DEVICE -constexpr rajaperf::Real_type compute_detj( - const rajaperf::Real_type jxx, - const rajaperf::Real_type jxy, - const rajaperf::Real_type jxz, - const rajaperf::Real_type jyx, - const rajaperf::Real_type jyy, - const rajaperf::Real_type jyz, - const rajaperf::Real_type jzx, - const rajaperf::Real_type jzy, - const rajaperf::Real_type jzz) +inline Real_type compute_detj( + const Real_type jxx, + const Real_type jxy, + const Real_type jxz, + const Real_type jyx, + 
const Real_type jyy, + const Real_type jyz, + const Real_type jzx, + const Real_type jzy, + const Real_type jzz) { return jxy*jyz*jzx - jxz*jyy*jzx + jxz*jyx*jzy - jxx*jyz*jzy - jxy*jyx*jzz + jxx*jyy*jzz; } -template -RAJA_HOST_DEVICE -constexpr void transform_basis( - const rajaperf::Real_type txx, - const rajaperf::Real_type txy, - const rajaperf::Real_type txz, - const rajaperf::Real_type tyx, - const rajaperf::Real_type tyy, - const rajaperf::Real_type tyz, - const rajaperf::Real_type tzx, - const rajaperf::Real_type tzy, - const rajaperf::Real_type tzz, - const rajaperf::Real_type (&basis_x)[M], - const rajaperf::Real_type (&basis_y)[M], - const rajaperf::Real_type (&basis_z)[M], - rajaperf::Real_type (&tbasis_x)[M], - rajaperf::Real_type (&tbasis_y)[M], - rajaperf::Real_type (&tbasis_z)[M]) +template +RAJA_HOST_DEVICE +inline void transform_basis( + const Real_type txx, + const Real_type txy, + const Real_type txz, + const Real_type tyx, + const Real_type tyy, + const Real_type tyz, + const Real_type tzx, + const Real_type tzy, + const Real_type tzz, + Real_array_const_ref basis_x, + Real_array_const_ref basis_y, + Real_array_const_ref basis_z, + Real_array_ref tbasis_x, + Real_array_ref tbasis_y, + Real_array_ref tbasis_z) { // Compute transformed basis function gradients - for (rajaperf::Int_type m = 0; m < M; m++) + for (Int_type m = 0; m < M; m++) { tbasis_x[m] = txx*basis_x[m] + txy*basis_y[m] + txz*basis_z[m]; tbasis_y[m] = tyx*basis_x[m] + tyy*basis_y[m] + tyz*basis_z[m]; @@ -266,39 +233,39 @@ constexpr void transform_basis( } } -template -constexpr rajaperf::Int_type flops_inner_product(const bool is_symmetric) +template +inline Int_type flops_inner_product(const bool is_symmetric) { return is_symmetric ? 
7*P*(M+1)/2 : 7*P*M; } -template +template RAJA_HOST_DEVICE -constexpr void inner_product( - const rajaperf::Real_type weight, - const rajaperf::Real_type (&basis_1_x)[M], - const rajaperf::Real_type (&basis_1_y)[M], - const rajaperf::Real_type (&basis_1_z)[M], - const rajaperf::Real_type (&basis_2_x)[P], - const rajaperf::Real_type (&basis_2_y)[P], - const rajaperf::Real_type (&basis_2_z)[P], - rajaperf::Real_type (&matrix)[P][M], +inline void inner_product( + const Real_type weight, + Real_array_const_ref basis_1_x, + Real_array_const_ref basis_1_y, + Real_array_const_ref basis_1_z, + Real_array_const_ref

basis_2_x, + Real_array_const_ref

basis_2_y, + Real_array_const_ref

basis_2_z, + Real_array2_ref matrix, const bool is_symmetric) { // inner product is - for (rajaperf::Int_type p = 0; p < P; p++) { + for (Int_type p = 0; p < P; p++) { - const rajaperf::Real_type txi = basis_2_x[p]; - const rajaperf::Real_type tyi = basis_2_y[p]; - const rajaperf::Real_type tzi = basis_2_z[p]; + const Real_type txi = basis_2_x[p]; + const Real_type tyi = basis_2_y[p]; + const Real_type tzi = basis_2_z[p]; - const rajaperf::Int_type m0 = (is_symmetric && (M == P)) ? p : 0; + const Int_type m0 = (is_symmetric && (M == P)) ? p : Int_type{0}; - for (rajaperf::Int_type m = m0; m < M; m++) { + for (Int_type m = m0; m < M; m++) { - const rajaperf::Real_type txj = basis_1_x[m]; - const rajaperf::Real_type tyj = basis_1_y[m]; - const rajaperf::Real_type tzj = basis_1_z[m]; + const Real_type txj = basis_1_x[m]; + const Real_type tyj = basis_1_y[m]; + const Real_type tzj = basis_1_z[m]; matrix[p][m] += weight*(txi*txj + tyi*tyj + tzi*tzj); @@ -310,19 +277,19 @@ constexpr void inner_product( } } -constexpr rajaperf::Int_type flops_bad_zone_algorithm() +inline Int_type flops_bad_zone_algorithm() { return 3; } RAJA_HOST_DEVICE RAJA_INLINE void bad_zone_algorithm( - const rajaperf::Real_type detj_unfixed, - const rajaperf::Real_type detj_cc, - const rajaperf::Real_type detj_tol, - rajaperf::Real_type& detj, - rajaperf::Real_type& abs_detj, - rajaperf::Real_type& inv_detj) + const Real_type detj_unfixed, + const Real_type detj_cc, + const Real_type detj_tol, + Real_type& detj, + Real_type& abs_detj, + Real_type& inv_detj) { detj = (fabs( detj_unfixed/detj_cc ) < detj_tol) ? 
detj_cc : detj_unfixed ; @@ -333,37 +300,37 @@ RAJA_INLINE void bad_zone_algorithm( inv_detj = 1.0/(detj + ptiny); } -constexpr rajaperf::Int_type flops_jacobian_inv() +inline Int_type flops_jacobian_inv() { return flops_compute_detj() + flops_bad_zone_algorithm() + 4*9; } RAJA_HOST_DEVICE RAJA_INLINE void jacobian_inv( - const rajaperf::Real_type jxx, - const rajaperf::Real_type jxy, - const rajaperf::Real_type jxz, - const rajaperf::Real_type jyx, - const rajaperf::Real_type jyy, - const rajaperf::Real_type jyz, - const rajaperf::Real_type jzx, - const rajaperf::Real_type jzy, - const rajaperf::Real_type jzz, - const rajaperf::Real_type detj_cc, - const rajaperf::Real_type detj_tol, - rajaperf::Real_type &jinvxx, - rajaperf::Real_type &jinvxy, - rajaperf::Real_type &jinvxz, - rajaperf::Real_type &jinvyx, - rajaperf::Real_type &jinvyy, - rajaperf::Real_type &jinvyz, - rajaperf::Real_type &jinvzx, - rajaperf::Real_type &jinvzy, - rajaperf::Real_type &jinvzz, - rajaperf::Real_type &detj_unfixed, - rajaperf::Real_type &detj, - rajaperf::Real_type &abs_detj, - rajaperf::Real_type &inv_detj) + const Real_type jxx, + const Real_type jxy, + const Real_type jxz, + const Real_type jyx, + const Real_type jyy, + const Real_type jyz, + const Real_type jzx, + const Real_type jzy, + const Real_type jzz, + const Real_type detj_cc, + const Real_type detj_tol, + Real_type &jinvxx, + Real_type &jinvxy, + Real_type &jinvxz, + Real_type &jinvyx, + Real_type &jinvyy, + Real_type &jinvyz, + Real_type &jinvzx, + Real_type &jinvzy, + Real_type &jinvzz, + Real_type &detj_unfixed, + Real_type &detj, + Real_type &abs_detj, + Real_type &inv_detj) { // Compute determinant of Jacobian matrix at this quadrature point detj_unfixed = compute_detj(jxx, jxy, jxz, @@ -386,14 +353,14 @@ RAJA_INLINE void jacobian_inv( } RAJA_HOST_DEVICE -constexpr rajaperf::Real_type Jzx( - const rajaperf::Real_type (&x)[NB], - const rajaperf::Real_type (&RAJA_UNUSED_ARG(y))[NB], - const rajaperf::Real_type 
(&RAJA_UNUSED_ARG(z))[NB], - const rajaperf::Real_type tmpxy, - const rajaperf::Real_type xloctmpy, - const rajaperf::Real_type xyloc, - const rajaperf::Real_type tmpxyloc) +inline Real_type Jzx( + Real_array_const_ref x, + Real_array_const_ref RAJA_UNUSED_ARG(y), + Real_array_const_ref RAJA_UNUSED_ARG(z), + const Real_type tmpxy, + const Real_type xloctmpy, + const Real_type xyloc, + const Real_type tmpxyloc) { return (x[4] - x[0])*tmpxy + (x[5] - x[1])*xloctmpy + @@ -402,14 +369,14 @@ constexpr rajaperf::Real_type Jzx( } RAJA_HOST_DEVICE -constexpr rajaperf::Real_type Jzy( - const rajaperf::Real_type (&RAJA_UNUSED_ARG(x))[NB], - const rajaperf::Real_type (&y)[NB], - const rajaperf::Real_type (&RAJA_UNUSED_ARG(z))[NB], - const rajaperf::Real_type tmpxy, - const rajaperf::Real_type xloctmpy, - const rajaperf::Real_type xyloc, - const rajaperf::Real_type tmpxyloc) +inline Real_type Jzy( + Real_array_const_ref RAJA_UNUSED_ARG(x), + Real_array_const_ref y, + Real_array_const_ref RAJA_UNUSED_ARG(z), + const Real_type tmpxy, + const Real_type xloctmpy, + const Real_type xyloc, + const Real_type tmpxyloc) { return (y[4] - y[0])*tmpxy + (y[5] - y[1])*xloctmpy + @@ -418,14 +385,14 @@ constexpr rajaperf::Real_type Jzy( } RAJA_HOST_DEVICE -constexpr rajaperf::Real_type Jzz( - const rajaperf::Real_type (&RAJA_UNUSED_ARG(x))[NB], - const rajaperf::Real_type (&RAJA_UNUSED_ARG(y))[NB], - const rajaperf::Real_type (&z)[NB], - const rajaperf::Real_type tmpxy, - const rajaperf::Real_type xloctmpy, - const rajaperf::Real_type xyloc, - const rajaperf::Real_type tmpxyloc) +inline Real_type Jzz( + Real_array_const_ref RAJA_UNUSED_ARG(x), + Real_array_const_ref RAJA_UNUSED_ARG(y), + Real_array_const_ref z, + const Real_type tmpxy, + const Real_type xloctmpy, + const Real_type xyloc, + const Real_type tmpxyloc) { return (z[4] - z[0])*tmpxy + (z[5] - z[1])*xloctmpy + @@ -433,20 +400,20 @@ constexpr rajaperf::Real_type Jzz( (z[7] - z[3])*tmpxyloc; } -constexpr rajaperf::Int_type 
flops_Jxx() +inline Int_type flops_Jxx() { return 11; } RAJA_HOST_DEVICE -constexpr rajaperf::Real_type Jxx( - const rajaperf::Real_type (&x)[NB], - const rajaperf::Real_type (&RAJA_UNUSED_ARG(y))[NB], - const rajaperf::Real_type (&RAJA_UNUSED_ARG(z))[NB], - const rajaperf::Real_type tmpyz, - const rajaperf::Real_type yloctmpz, - const rajaperf::Real_type tmpyzloc, - const rajaperf::Real_type yzloc) +inline Real_type Jxx( + Real_array_const_ref x, + Real_array_const_ref RAJA_UNUSED_ARG(y), + Real_array_const_ref RAJA_UNUSED_ARG(z), + const Real_type tmpyz, + const Real_type yloctmpz, + const Real_type tmpyzloc, + const Real_type yzloc) { return (x[1] - x[0])*tmpyz + (x[2] - x[3])*yloctmpz + @@ -455,14 +422,14 @@ constexpr rajaperf::Real_type Jxx( } RAJA_HOST_DEVICE -constexpr rajaperf::Real_type Jxy( - const rajaperf::Real_type (&RAJA_UNUSED_ARG(x))[NB], - const rajaperf::Real_type (&y)[NB], - const rajaperf::Real_type (&RAJA_UNUSED_ARG(z))[NB], - const rajaperf::Real_type tmpyz, - const rajaperf::Real_type yloctmpz, - const rajaperf::Real_type tmpyzloc, - const rajaperf::Real_type yzloc) +inline Real_type Jxy( + Real_array_const_ref RAJA_UNUSED_ARG(x), + Real_array_const_ref y, + Real_array_const_ref RAJA_UNUSED_ARG(z), + const Real_type tmpyz, + const Real_type yloctmpz, + const Real_type tmpyzloc, + const Real_type yzloc) { return (y[1] - y[0])*tmpyz + (y[2] - y[3])*yloctmpz + @@ -471,14 +438,14 @@ constexpr rajaperf::Real_type Jxy( } RAJA_HOST_DEVICE -constexpr rajaperf::Real_type Jxz( - const rajaperf::Real_type (&RAJA_UNUSED_ARG(x))[NB], - const rajaperf::Real_type (&RAJA_UNUSED_ARG(y))[NB], - const rajaperf::Real_type (&z)[NB], - const rajaperf::Real_type tmpyz, - const rajaperf::Real_type yloctmpz, - const rajaperf::Real_type tmpyzloc, - const rajaperf::Real_type yzloc) +inline Real_type Jxz( + Real_array_const_ref RAJA_UNUSED_ARG(x), + Real_array_const_ref RAJA_UNUSED_ARG(y), + Real_array_const_ref z, + const Real_type tmpyz, + const Real_type yloctmpz, + 
const Real_type tmpyzloc, + const Real_type yzloc) { return (z[1] - z[0])*tmpyz + (z[2] - z[3])*yloctmpz + @@ -487,14 +454,14 @@ constexpr rajaperf::Real_type Jxz( } RAJA_HOST_DEVICE -constexpr rajaperf::Real_type Jyx( - const rajaperf::Real_type (&x)[NB], - const rajaperf::Real_type (&RAJA_UNUSED_ARG(y))[NB], - const rajaperf::Real_type (&RAJA_UNUSED_ARG(z))[NB], - const rajaperf::Real_type tmpxz, - const rajaperf::Real_type xloctmpz, - const rajaperf::Real_type tmpxzloc, - const rajaperf::Real_type xzloc) +inline Real_type Jyx( + Real_array_const_ref x, + Real_array_const_ref RAJA_UNUSED_ARG(y), + Real_array_const_ref RAJA_UNUSED_ARG(z), + const Real_type tmpxz, + const Real_type xloctmpz, + const Real_type tmpxzloc, + const Real_type xzloc) { return (x[3] - x[0])*tmpxz + (x[2] - x[1])*xloctmpz + @@ -503,14 +470,14 @@ constexpr rajaperf::Real_type Jyx( } RAJA_HOST_DEVICE -constexpr rajaperf::Real_type Jyy( - const rajaperf::Real_type (&RAJA_UNUSED_ARG(x))[NB], - const rajaperf::Real_type (&y)[NB], - const rajaperf::Real_type (&RAJA_UNUSED_ARG(z))[NB], - const rajaperf::Real_type tmpxz, - const rajaperf::Real_type xloctmpz, - const rajaperf::Real_type tmpxzloc, - const rajaperf::Real_type xzloc) +inline Real_type Jyy( + Real_array_const_ref RAJA_UNUSED_ARG(x), + Real_array_const_ref y, + Real_array_const_ref RAJA_UNUSED_ARG(z), + const Real_type tmpxz, + const Real_type xloctmpz, + const Real_type tmpxzloc, + const Real_type xzloc) { return (y[3] - y[0])*tmpxz + (y[2] - y[1])*xloctmpz + @@ -519,14 +486,14 @@ constexpr rajaperf::Real_type Jyy( } RAJA_HOST_DEVICE -constexpr rajaperf::Real_type Jyz( - const rajaperf::Real_type (&RAJA_UNUSED_ARG(x))[NB], - const rajaperf::Real_type (&RAJA_UNUSED_ARG(y))[NB], - const rajaperf::Real_type (&z)[NB], - const rajaperf::Real_type tmpxz, - const rajaperf::Real_type xloctmpz, - const rajaperf::Real_type tmpxzloc, - const rajaperf::Real_type xzloc) +inline Real_type Jyz( + Real_array_const_ref RAJA_UNUSED_ARG(x), + 
Real_array_const_ref RAJA_UNUSED_ARG(y), + Real_array_const_ref z, + const Real_type tmpxz, + const Real_type xloctmpz, + const Real_type tmpxzloc, + const Real_type xzloc) { return (z[3] - z[0])*tmpxz + (z[2] - z[1])*xloctmpz + @@ -538,14 +505,14 @@ constexpr rajaperf::Real_type Jyz( // Node basis //----------------------------------------- RAJA_HOST_DEVICE -constexpr void nodebasis( - rajaperf::Real_type (&basis)[NB], - const rajaperf::Real_type tmpxy, - const rajaperf::Real_type xloctmpy, - const rajaperf::Real_type xyloc, - const rajaperf::Real_type tmpxyloc, - const rajaperf::Real_type zloc, - const rajaperf::Real_type tmpz) +inline void nodebasis( + Real_array_ref basis, + const Real_type tmpxy, + const Real_type xloctmpy, + const Real_type xyloc, + const Real_type tmpxyloc, + const Real_type zloc, + const Real_type tmpz) { basis[0] = tmpxy*tmpz; basis[1] = xloctmpy*tmpz; @@ -558,12 +525,12 @@ constexpr void nodebasis( } RAJA_HOST_DEVICE -constexpr void dnodebasis_dx( - rajaperf::Real_type (&dbasis)[NB], - const rajaperf::Real_type tmpyz, - const rajaperf::Real_type yloctmpz, - const rajaperf::Real_type tmpyzloc, - const rajaperf::Real_type yzloc) +inline void dnodebasis_dx( + Real_array_ref dbasis, + const Real_type tmpyz, + const Real_type yloctmpz, + const Real_type tmpyzloc, + const Real_type yzloc) { dbasis[0] = -tmpyz; dbasis[1] = tmpyz; @@ -576,12 +543,12 @@ constexpr void dnodebasis_dx( } RAJA_HOST_DEVICE -constexpr void dnodebasis_dy( - rajaperf::Real_type (&dbasis)[NB], - const rajaperf::Real_type tmpxz, - const rajaperf::Real_type xloctmpz, - const rajaperf::Real_type tmpxzloc, - const rajaperf::Real_type xzloc) +inline void dnodebasis_dy( + Real_array_ref dbasis, + const Real_type tmpxz, + const Real_type xloctmpz, + const Real_type tmpxzloc, + const Real_type xzloc) { dbasis[0] = -tmpxz; dbasis[1] = -xloctmpz; @@ -594,12 +561,12 @@ constexpr void dnodebasis_dy( } RAJA_HOST_DEVICE -constexpr void dnodebasis_dz( - rajaperf::Real_type (&dbasis)[NB], 
- const rajaperf::Real_type tmpxy, - const rajaperf::Real_type xloctmpy, - const rajaperf::Real_type xyloc, - const rajaperf::Real_type tmpxyloc) +inline void dnodebasis_dz( + Real_array_ref dbasis, + const Real_type tmpxy, + const Real_type xloctmpy, + const Real_type xyloc, + const Real_type tmpxyloc) { dbasis[0] = -tmpxy; dbasis[1] = -xloctmpy; @@ -612,22 +579,22 @@ constexpr void dnodebasis_dz( } RAJA_HOST_DEVICE -constexpr void transform_node_dbasis( - const rajaperf::Real_type jinvxx, - const rajaperf::Real_type jinvxy, - const rajaperf::Real_type jinvxz, - const rajaperf::Real_type jinvyx, - const rajaperf::Real_type jinvyy, - const rajaperf::Real_type jinvyz, - const rajaperf::Real_type jinvzx, - const rajaperf::Real_type jinvzy, - const rajaperf::Real_type jinvzz, - rajaperf::Real_type (&basisx)[NB], - rajaperf::Real_type (&basisy)[NB], - rajaperf::Real_type (&basisz)[NB], - rajaperf::Real_type (&tbasisx)[NB], - rajaperf::Real_type (&tbasisy)[NB], - rajaperf::Real_type (&tbasisz)[NB]) +inline void transform_node_dbasis( + const Real_type jinvxx, + const Real_type jinvxy, + const Real_type jinvxz, + const Real_type jinvyx, + const Real_type jinvyy, + const Real_type jinvyz, + const Real_type jinvzx, + const Real_type jinvzy, + const Real_type jinvzz, + Real_array_const_ref basisx, + Real_array_const_ref basisy, + Real_array_const_ref basisz, + Real_array_ref tbasisx, + Real_array_ref tbasisy, + Real_array_ref tbasisz) { // Transform is: Grad(w_i) <- J^{-1} Grad(w_i) transform_basis( @@ -642,12 +609,12 @@ constexpr void transform_node_dbasis( // Edge basis //----------------------------------------- RAJA_HOST_DEVICE -constexpr void edgebasis_x( - rajaperf::Real_type (&basisx)[EB], - const rajaperf::Real_type tmpyz, - const rajaperf::Real_type yloctmpz, - const rajaperf::Real_type tmpyzloc, - const rajaperf::Real_type yzloc) +inline void edgebasis_x( + Real_array_ref basisx, + const Real_type tmpyz, + const Real_type yloctmpz, + const Real_type tmpyzloc, + 
const Real_type yzloc) { basisx[0] = tmpyz; basisx[1] = yloctmpz; @@ -665,12 +632,12 @@ constexpr void edgebasis_x( // Evaluate basis with respect to y at this quadrature point RAJA_HOST_DEVICE -constexpr void edgebasis_y( - rajaperf::Real_type (&basisy)[EB], - const rajaperf::Real_type tmpxz, - const rajaperf::Real_type xloctmpz, - const rajaperf::Real_type tmpxzloc, - const rajaperf::Real_type xzloc) +inline void edgebasis_y( + Real_array_ref basisy, + const Real_type tmpxz, + const Real_type xloctmpz, + const Real_type tmpxzloc, + const Real_type xzloc) { basisy[0] = 0.0; basisy[1] = 0.0; @@ -688,12 +655,12 @@ constexpr void edgebasis_y( // Evaluate basis with respect to z at this quadrature point RAJA_HOST_DEVICE -constexpr void edgebasis_z( - rajaperf::Real_type (&basisz)[EB], - const rajaperf::Real_type tmpxy, - const rajaperf::Real_type xloctmpy, - const rajaperf::Real_type xyloc, - const rajaperf::Real_type tmpxyloc) +inline void edgebasis_z( + Real_array_ref basisz, + const Real_type tmpxy, + const Real_type xloctmpy, + const Real_type xyloc, + const Real_type tmpxyloc) { basisz[0] = 0.0; basisz[1] = 0.0; @@ -711,10 +678,10 @@ constexpr void edgebasis_z( // Differeniate basis with respect to x at this quadrature point RAJA_HOST_DEVICE -constexpr void curl_edgebasis_x( - rajaperf::Real_type (&dbasisx)[EB], - const rajaperf::Real_type tmpx, - const rajaperf::Real_type xpt) +inline void curl_edgebasis_x( + Real_array_ref dbasisx, + const Real_type tmpx, + const Real_type xpt) { dbasisx[0] = 0.0; // dbasisx[1] = 0.0; // @@ -732,10 +699,10 @@ constexpr void curl_edgebasis_x( // Differeniate basis with respect to y at this quadrature point RAJA_HOST_DEVICE -constexpr void curl_edgebasis_y( - rajaperf::Real_type (&dbasisy)[EB], - const rajaperf::Real_type tmpy, - const rajaperf::Real_type ypt) +inline void curl_edgebasis_y( + Real_array_ref dbasisy, + const Real_type tmpy, + const Real_type ypt) { dbasisy[0] = -tmpy; // -1*f2 dbasisy[1] = -ypt; // -1*f3 @@ 
-753,10 +720,10 @@ constexpr void curl_edgebasis_y( // Differeniate basis with respect to z at this quadrature point RAJA_HOST_DEVICE -constexpr void curl_edgebasis_z( - rajaperf::Real_type (&dbasisz)[EB], - const rajaperf::Real_type tmpz, - const rajaperf::Real_type zpt) +inline void curl_edgebasis_z( + Real_array_ref dbasisz, + const Real_type tmpz, + const Real_type zpt) { dbasisz[0] = tmpz; // +1*f4 dbasisz[1] = -tmpz; // -1*f4 @@ -773,58 +740,58 @@ constexpr void curl_edgebasis_z( } RAJA_HOST_DEVICE -constexpr void edgebasis( - const rajaperf::Real_type xloc, - const rajaperf::Real_type yloc, - const rajaperf::Real_type zloc, - rajaperf::Real_type (&ebasisx)[EB], - rajaperf::Real_type (&ebasisy)[EB], - rajaperf::Real_type (&ebasisz)[EB]) -{ - const rajaperf::Real_type tmpx = 1. - xloc; - const rajaperf::Real_type tmpy = 1. - yloc; - const rajaperf::Real_type tmpz = 1. - zloc; - - const rajaperf::Real_type tmpxy = tmpx*tmpy; - const rajaperf::Real_type xyloc = xloc*yloc; - const rajaperf::Real_type tmpxyloc = tmpx*yloc; - const rajaperf::Real_type xloctmpy = xloc*tmpy; - const rajaperf::Real_type tmpxz = tmpx*tmpz; - const rajaperf::Real_type tmpyz = tmpy*tmpz; - const rajaperf::Real_type xzloc = xloc*zloc; - const rajaperf::Real_type yzloc = yloc*zloc; - const rajaperf::Real_type tmpyzloc = tmpy*zloc; - const rajaperf::Real_type tmpxzloc = tmpx*zloc; - const rajaperf::Real_type yloctmpz = yloc*tmpz; - const rajaperf::Real_type xloctmpz = xloc*tmpz; +inline void edgebasis( + const Real_type xloc, + const Real_type yloc, + const Real_type zloc, + Real_array_ref ebasisx, + Real_array_ref ebasisy, + Real_array_ref ebasisz) +{ + const Real_type tmpx = 1. - xloc; + const Real_type tmpy = 1. - yloc; + const Real_type tmpz = 1. 
- zloc; + + const Real_type tmpxy = tmpx*tmpy; + const Real_type xyloc = xloc*yloc; + const Real_type tmpxyloc = tmpx*yloc; + const Real_type xloctmpy = xloc*tmpy; + const Real_type tmpxz = tmpx*tmpz; + const Real_type tmpyz = tmpy*tmpz; + const Real_type xzloc = xloc*zloc; + const Real_type yzloc = yloc*zloc; + const Real_type tmpyzloc = tmpy*zloc; + const Real_type tmpxzloc = tmpx*zloc; + const Real_type yloctmpz = yloc*tmpz; + const Real_type xloctmpz = xloc*tmpz; edgebasis_x(ebasisx, tmpyz, yloctmpz, tmpyzloc, yzloc); edgebasis_y(ebasisy, tmpxz, xloctmpz, tmpxzloc, xzloc); edgebasis_z(ebasisz, tmpxy, xloctmpy, xyloc, tmpxyloc); } -constexpr rajaperf::Int_type flops_transform_basis(int basis_size) +inline Int_type flops_transform_basis(int basis_size) { return 3*5*basis_size; } RAJA_HOST_DEVICE -constexpr void transform_edge_basis( - const rajaperf::Real_type jinvxx, - const rajaperf::Real_type jinvxy, - const rajaperf::Real_type jinvxz, - const rajaperf::Real_type jinvyx, - const rajaperf::Real_type jinvyy, - const rajaperf::Real_type jinvyz, - const rajaperf::Real_type jinvzx, - const rajaperf::Real_type jinvzy, - const rajaperf::Real_type jinvzz, - rajaperf::Real_type (&basisx)[EB], - rajaperf::Real_type (&basisy)[EB], - rajaperf::Real_type (&basisz)[EB], - rajaperf::Real_type (&tbasisx)[EB], - rajaperf::Real_type (&tbasisy)[EB], - rajaperf::Real_type (&tbasisz)[EB]) +inline void transform_edge_basis( + const Real_type jinvxx, + const Real_type jinvxy, + const Real_type jinvxz, + const Real_type jinvyx, + const Real_type jinvyy, + const Real_type jinvyz, + const Real_type jinvzx, + const Real_type jinvzy, + const Real_type jinvzz, + Real_array_const_ref basisx, + Real_array_const_ref basisy, + Real_array_const_ref basisz, + Real_array_ref tbasisx, + Real_array_ref tbasisy, + Real_array_ref tbasisz) { // Transform is: w_i <- J^{-1} w_i transform_basis( @@ -836,23 +803,23 @@ constexpr void transform_edge_basis( } RAJA_HOST_DEVICE -constexpr void 
transform_curl_edge_basis( - const rajaperf::Real_type jxx, - const rajaperf::Real_type jxy, - const rajaperf::Real_type jxz, - const rajaperf::Real_type jyx, - const rajaperf::Real_type jyy, - const rajaperf::Real_type jyz, - const rajaperf::Real_type jzx, - const rajaperf::Real_type jzy, - const rajaperf::Real_type jzz, - const rajaperf::Real_type invdetj, - rajaperf::Real_type (&basisx)[EB], - rajaperf::Real_type (&basisy)[EB], - rajaperf::Real_type (&basisz)[EB], - rajaperf::Real_type (&tbasisx)[EB], - rajaperf::Real_type (&tbasisy)[EB], - rajaperf::Real_type (&tbasisz)[EB]) +inline void transform_curl_edge_basis( + const Real_type jxx, + const Real_type jxy, + const Real_type jxz, + const Real_type jyx, + const Real_type jyy, + const Real_type jyz, + const Real_type jzx, + const Real_type jzy, + const Real_type jzz, + const Real_type invdetj, + Real_array_const_ref basisx, + Real_array_const_ref basisy, + Real_array_const_ref basisz, + Real_array_ref tbasisx, + Real_array_ref tbasisy, + Real_array_ref tbasisz) { // Transform is: Curl(w_i) <- (1/|J|)J^{T} Curl(w_i) transform_basis( @@ -867,10 +834,10 @@ constexpr void transform_curl_edge_basis( // Face basis //----------------------------------------- RAJA_HOST_DEVICE -constexpr void face_basis_x( - rajaperf::Real_type (&basisx)[FB], - const rajaperf::Real_type tmpx, - const rajaperf::Real_type xpt) +inline void face_basis_x( + Real_array_ref basisx, + const Real_type tmpx, + const Real_type xpt) { basisx[0] = tmpx; basisx[1] = xpt; @@ -881,10 +848,10 @@ constexpr void face_basis_x( } RAJA_HOST_DEVICE -constexpr void face_basis_y( - rajaperf::Real_type (&basisy)[FB], - const rajaperf::Real_type tmpy, - const rajaperf::Real_type ypt) +inline void face_basis_y( + Real_array_ref basisy, + const Real_type tmpy, + const Real_type ypt) { basisy[0] = 0.0; basisy[1] = 0.0; @@ -895,10 +862,10 @@ constexpr void face_basis_y( } RAJA_HOST_DEVICE -constexpr void face_basis_z( - rajaperf::Real_type (&basisz)[FB], - const 
rajaperf::Real_type tmpz, - const rajaperf::Real_type zpt) +inline void face_basis_z( + Real_array_ref basisz, + const Real_type tmpz, + const Real_type zpt) { basisz[0] = 0.0; basisz[1] = 0.0; @@ -909,23 +876,23 @@ constexpr void face_basis_z( } RAJA_HOST_DEVICE -constexpr void transform_face_basis( - const rajaperf::Real_type jxx, - const rajaperf::Real_type jxy, - const rajaperf::Real_type jxz, - const rajaperf::Real_type jyx, - const rajaperf::Real_type jyy, - const rajaperf::Real_type jyz, - const rajaperf::Real_type jzx, - const rajaperf::Real_type jzy, - const rajaperf::Real_type jzz, - const rajaperf::Real_type invdetj, - rajaperf::Real_type (&basisx)[FB], - rajaperf::Real_type (&basisy)[FB], - rajaperf::Real_type (&basisz)[FB], - rajaperf::Real_type (&tbasisx)[FB], - rajaperf::Real_type (&tbasisy)[FB], - rajaperf::Real_type (&tbasisz)[FB]) +inline void transform_face_basis( + const Real_type jxx, + const Real_type jxy, + const Real_type jxz, + const Real_type jyx, + const Real_type jyy, + const Real_type jyz, + const Real_type jzx, + const Real_type jzy, + const Real_type jzz, + const Real_type invdetj, + Real_array_const_ref basisx, + Real_array_const_ref basisy, + Real_array_const_ref basisz, + Real_array_ref tbasisx, + Real_array_ref tbasisy, + Real_array_ref tbasisz) { // Transform is: f_i <- (1/|J|)J^{T} f_i transform_basis( @@ -936,4 +903,6 @@ constexpr void transform_face_basis( tbasisx, tbasisy, tbasisz); } +} // namespace rajaperf + #endif // closing endif for header file include guard diff --git a/src/basic/ARRAY_OF_PTRS.cpp b/src/basic/ARRAY_OF_PTRS.cpp index 329eb0f4b..ecb9d1160 100644 --- a/src/basic/ARRAY_OF_PTRS.cpp +++ b/src/basic/ARRAY_OF_PTRS.cpp @@ -81,5 +81,47 @@ void ARRAY_OF_PTRS::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include 
"common/CountingMacros.hpp" + +void ARRAY_OF_PTRS::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + ARRAY_OF_PTRS_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ARRAY_OF_PTRS_BODY(x)); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/ARRAY_OF_PTRS.hpp b/src/basic/ARRAY_OF_PTRS.hpp index 9cbacc4f2..840493412 100644 --- a/src/basic/ARRAY_OF_PTRS.hpp +++ b/src/basic/ARRAY_OF_PTRS.hpp @@ -66,6 +66,7 @@ class ARRAY_OF_PTRS : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/basic/COPY8.cpp b/src/basic/COPY8.cpp index f474739fd..795b05951 100644 --- a/src/basic/COPY8.cpp +++ b/src/basic/COPY8.cpp @@ -114,5 +114,47 @@ void COPY8::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_y7, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void COPY8::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + 
COPY8_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(COPY8_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/COPY8.hpp b/src/basic/COPY8.hpp index 82c4f9057..b0de97dde 100644 --- a/src/basic/COPY8.hpp +++ b/src/basic/COPY8.hpp @@ -75,6 +75,7 @@ class COPY8 : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp index 4a69cb949..40c589b9e 100644 --- a/src/basic/DAXPY.cpp +++ b/src/basic/DAXPY.cpp @@ -80,5 +80,47 @@ void DAXPY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void DAXPY::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + DAXPY_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(DAXPY_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/DAXPY.hpp b/src/basic/DAXPY.hpp index 
43ae1f7ee..2f22e62e9 100644 --- a/src/basic/DAXPY.hpp +++ b/src/basic/DAXPY.hpp @@ -48,6 +48,7 @@ class DAXPY : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/basic/DAXPY_ATOMIC.cpp b/src/basic/DAXPY_ATOMIC.cpp index afee6b863..d83ddc803 100644 --- a/src/basic/DAXPY_ATOMIC.cpp +++ b/src/basic/DAXPY_ATOMIC.cpp @@ -81,5 +81,47 @@ void DAXPY_ATOMIC::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void DAXPY_ATOMIC::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + DAXPY_ATOMIC_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(RAJAPERF_ATOMIC_ADD_COUNTING(y[i], a * x[i]);); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/DAXPY_ATOMIC.hpp b/src/basic/DAXPY_ATOMIC.hpp index fc4bd9532..9acc64579 100644 --- a/src/basic/DAXPY_ATOMIC.hpp +++ b/src/basic/DAXPY_ATOMIC.hpp @@ -48,6 +48,7 @@ class DAXPY_ATOMIC : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void 
defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/basic/EMPTY.cpp b/src/basic/EMPTY.cpp index 58eb07518..a1c105f86 100644 --- a/src/basic/EMPTY.cpp +++ b/src/basic/EMPTY.cpp @@ -73,5 +73,47 @@ void EMPTY::tearDown(VariantID RAJAPERF_UNUSED_ARG(vid), size_t RAJAPERF_UNUSED_ { } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void EMPTY::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + EMPTY_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(EMPTY_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/EMPTY.hpp b/src/basic/EMPTY.hpp index 10ad340ba..5a078060f 100644 --- a/src/basic/EMPTY.hpp +++ b/src/basic/EMPTY.hpp @@ -52,6 +52,7 @@ class EMPTY : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index 329e2a4e6..f7e53f160 100644 --- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -87,5 +87,47 @@ void IF_QUAD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_x2, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) 
become wrappers past this point +#include "common/CountingMacros.hpp" + +void IF_QUAD::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + IF_QUAD_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(IF_QUAD_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/IF_QUAD.hpp b/src/basic/IF_QUAD.hpp index af63dc5f2..e21f96132 100644 --- a/src/basic/IF_QUAD.hpp +++ b/src/basic/IF_QUAD.hpp @@ -65,6 +65,7 @@ class IF_QUAD : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index e377f31c8..c301e951e 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -85,5 +85,57 @@ void INDEXLIST::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_list, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void INDEXLIST::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = 
getActualProblemSize(); + + INDEXLIST_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Index_type count = 0; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(INDEXLIST_BODY); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_len = count; + ); + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INDEXLIST.hpp b/src/basic/INDEXLIST.hpp index ae85c2fcc..ac6dbcd20 100644 --- a/src/basic/INDEXLIST.hpp +++ b/src/basic/INDEXLIST.hpp @@ -56,6 +56,7 @@ class INDEXLIST : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp index 13467d936..48839e2ea 100644 --- a/src/basic/INDEXLIST_3LOOP.cpp +++ b/src/basic/INDEXLIST_3LOOP.cpp @@ -88,5 +88,77 @@ void INDEXLIST_3LOOP::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_id deallocData(m_list, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void INDEXLIST_3LOOP::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INDEXLIST_3LOOP_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + INDEXLIST_3LOOP_COUNTS_SETUP(DataSpace::Host); + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { 
+ + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 1 : 0); + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Index_type count = 0; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend+1; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY( + Index_type inc = counts[i]; + counts[i] = count; + count += inc; + ); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(INDEXLIST_3LOOP_MAKE_LIST); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_len = counts[iend]; + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + INDEXLIST_3LOOP_COUNTS_TEARDOWN(DataSpace::Host); + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INDEXLIST_3LOOP.hpp b/src/basic/INDEXLIST_3LOOP.hpp index ce7c8d7b4..7ba635904 100644 --- a/src/basic/INDEXLIST_3LOOP.hpp +++ b/src/basic/INDEXLIST_3LOOP.hpp @@ -74,6 +74,7 @@ class INDEXLIST_3LOOP : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index 317fcee94..951f5128c 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -86,5 +86,47 @@ void INIT3::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_in2, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void INIT3::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + 
RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INIT3_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(INIT3_OPT_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT3.hpp b/src/basic/INIT3.hpp index 7ab94b844..49c309f68 100644 --- a/src/basic/INIT3.hpp +++ b/src/basic/INIT3.hpp @@ -29,6 +29,12 @@ #define INIT3_BODY \ out1[i] = out2[i] = out3[i] = - in1[i] - in2[i] ; +#define INIT3_OPT_BODY \ + Real_type tmp = - in1[i] - in2[i]; \ + out1[i] = tmp ; \ + out2[i] = tmp ; \ + out3[i] = tmp ; + #include "common/KernelBase.hpp" @@ -51,6 +57,7 @@ class INIT3 : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp index 337e65e3c..71ea54fe8 100644 --- a/src/basic/INIT_VIEW1D.cpp +++ b/src/basic/INIT_VIEW1D.cpp @@ -78,5 +78,47 @@ void INIT_VIEW1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_a, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void INIT_VIEW1D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const 
Index_type iend = getActualProblemSize(); + + INIT_VIEW1D_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(INIT_VIEW1D_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D.hpp b/src/basic/INIT_VIEW1D.hpp index c59dbce18..6560a666a 100644 --- a/src/basic/INIT_VIEW1D.hpp +++ b/src/basic/INIT_VIEW1D.hpp @@ -62,6 +62,7 @@ class INIT_VIEW1D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp index 2d0ee5793..06daef02b 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -78,5 +78,47 @@ void INIT_VIEW1D_OFFSET::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune deallocData(m_a, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void INIT_VIEW1D_OFFSET::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 1; + const Index_type iend = getActualProblemSize()+1; + + INIT_VIEW1D_OFFSET_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(INIT_VIEW1D_OFFSET_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + 
tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D_OFFSET.hpp b/src/basic/INIT_VIEW1D_OFFSET.hpp index f517005e0..a3b9a772c 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.hpp +++ b/src/basic/INIT_VIEW1D_OFFSET.hpp @@ -61,6 +61,7 @@ class INIT_VIEW1D_OFFSET : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/basic/MAT_MAT_SHARED-Cuda.cpp b/src/basic/MAT_MAT_SHARED-Cuda.cpp index e58a1139a..c77b5937f 100644 --- a/src/basic/MAT_MAT_SHARED-Cuda.cpp +++ b/src/basic/MAT_MAT_SHARED-Cuda.cpp @@ -39,7 +39,9 @@ __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, __syncthreads(); - MAT_MAT_SHARED_BODY_3(tile_size) + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } __syncthreads(); } @@ -132,7 +134,11 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) __syncthreads(); auto inner_y_3 = [&](Index_type ty) { - auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3(tile_size) }; + auto inner_x_3 = [&](Index_type tx) { + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } + }; { Index_type tx = threadIdx.x; @@ -246,7 +252,9 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) [&](Index_type ty) { RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3(tile_size) + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } } ); // RAJA::loop } diff --git a/src/basic/MAT_MAT_SHARED-Hip.cpp b/src/basic/MAT_MAT_SHARED-Hip.cpp index f67c8f288..322200f3c 100644 --- a/src/basic/MAT_MAT_SHARED-Hip.cpp +++ b/src/basic/MAT_MAT_SHARED-Hip.cpp @@ -39,7 +39,9 @@ __global__ void mat_mat_shared(Index_type N, 
Real_ptr C, Real_ptr A, __syncthreads(); - MAT_MAT_SHARED_BODY_3(tile_size) + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } __syncthreads(); } @@ -132,7 +134,11 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) __syncthreads(); auto inner_y_3 = [&](Index_type ty) { - auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3(tile_size) }; + auto inner_x_3 = [&](Index_type tx) { + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } + }; { Index_type tx = threadIdx.x; @@ -245,7 +251,9 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) [&](Index_type ty) { RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3(tile_size) + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } } ); // RAJA::loop } diff --git a/src/basic/MAT_MAT_SHARED-OMP.cpp b/src/basic/MAT_MAT_SHARED-OMP.cpp index d37ffe0cb..cc7395387 100644 --- a/src/basic/MAT_MAT_SHARED-OMP.cpp +++ b/src/basic/MAT_MAT_SHARED-OMP.cpp @@ -62,7 +62,9 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { for (Index_type ty = 0; ty < TL_SZ; ++ty) { for (Index_type tx = 0; tx < TL_SZ; ++tx) { - MAT_MAT_SHARED_BODY_3(TL_SZ) + for (Index_type n = 0; n < TL_SZ; ++n) { + MAT_MAT_SHARED_BODY_3(TL_SZ) + } } } } @@ -120,7 +122,11 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { } auto inner_y_3 = [&](Index_type ty) { - auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3(TL_SZ) }; + auto inner_x_3 = [&](Index_type tx) { + for (Index_type n = 0; n < TL_SZ; ++n) { + MAT_MAT_SHARED_BODY_3(TL_SZ) + } + }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { inner_x_3(tx); @@ -218,7 +224,9 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { [&](Index_type ty) { RAJA::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3(TL_SZ) + for (Index_type n = 0; n < TL_SZ; ++n) { + MAT_MAT_SHARED_BODY_3(TL_SZ) + } } ); // RAJA::loop } diff --git 
a/src/basic/MAT_MAT_SHARED-Seq.cpp b/src/basic/MAT_MAT_SHARED-Seq.cpp index cf844c3a9..3a993263d 100644 --- a/src/basic/MAT_MAT_SHARED-Seq.cpp +++ b/src/basic/MAT_MAT_SHARED-Seq.cpp @@ -53,7 +53,9 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { for (Index_type ty = 0; ty < TL_SZ; ++ty) { for (Index_type tx = 0; tx < TL_SZ; ++tx) { - MAT_MAT_SHARED_BODY_3(TL_SZ) + for (Index_type n = 0; n < TL_SZ; ++n) { + MAT_MAT_SHARED_BODY_3(TL_SZ) + } } } @@ -114,7 +116,11 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { } auto inner_y_3 = [&](Index_type ty) { - auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3(TL_SZ) }; + auto inner_x_3 = [&](Index_type tx) { + for (Index_type n = 0; n < TL_SZ; ++n) { + MAT_MAT_SHARED_BODY_3(TL_SZ) + } + }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { inner_x_3(tx); @@ -127,7 +133,9 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { } auto inner_y_4 = [&](Index_type ty) { - auto inner_x_4 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_4(TL_SZ) }; + auto inner_x_4 = [&](Index_type tx) { + MAT_MAT_SHARED_BODY_4(TL_SZ) + }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { inner_x_4(tx); @@ -213,7 +221,9 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { [&](Index_type ty) { RAJA::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3(TL_SZ) + for (Index_type n = 0; n < TL_SZ; ++n) { + MAT_MAT_SHARED_BODY_3(TL_SZ) + } } ); // RAJA::loop } diff --git a/src/basic/MAT_MAT_SHARED-Sycl.cpp b/src/basic/MAT_MAT_SHARED-Sycl.cpp index 88e1f265f..3cb06a849 100644 --- a/src/basic/MAT_MAT_SHARED-Sycl.cpp +++ b/src/basic/MAT_MAT_SHARED-Sycl.cpp @@ -52,9 +52,9 @@ void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid) qu->submit([&](::sycl::handler& h) { - ::sycl::local_accessor As(::sycl::range<2>(tile_size, tile_size), h); - ::sycl::local_accessor Bs(::sycl::range<2>(tile_size, tile_size), h); - ::sycl::local_accessor Cs(::sycl::range<2>(tile_size, tile_size), h); + ::sycl::local_accessor 
As(::sycl::range<2>(tile_size, tile_size), h); + ::sycl::local_accessor Bs(::sycl::range<2>(tile_size, tile_size), h); + ::sycl::local_accessor Cs(::sycl::range<2>(tile_size, tile_size), h); h.parallel_for (::sycl::nd_range<3>(gridSize, workGroupSize), @@ -73,7 +73,9 @@ void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid) itm.barrier(::sycl::access::fence_space::local_space); - MAT_MAT_SHARED_BODY_3(tile_size) + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } itm.barrier(::sycl::access::fence_space::local_space); } @@ -93,7 +95,7 @@ void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid) constexpr bool async = true; const int local_mats = 3; - constexpr size_t shmem = tile_size * tile_size * local_mats * sizeof(double); + constexpr size_t shmem = tile_size * tile_size * local_mats * sizeof(Real_type); using launch_policy = RAJA::LaunchPolicy>; @@ -122,12 +124,12 @@ void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid) //We only support dynamic shared memory in Sycl //Thus requiring a different setup than other backends //which use static shared memory - double * As_ptr = ctx.getSharedMemory(tile_size * tile_size); - double * Bs_ptr = ctx.getSharedMemory(tile_size * tile_size); - double * Cs_ptr = ctx.getSharedMemory(tile_size * tile_size); - double (*As)[tile_size] = (double (*)[tile_size]) As_ptr; - double (*Bs)[tile_size] = (double (*)[tile_size]) Bs_ptr; - double (*Cs)[tile_size] = (double (*)[tile_size]) Cs_ptr; + Real_type * As_ptr = ctx.getSharedMemory(tile_size * tile_size); + Real_type * Bs_ptr = ctx.getSharedMemory(tile_size * tile_size); + Real_type * Cs_ptr = ctx.getSharedMemory(tile_size * tile_size); + Real_type (*As)[tile_size] = (Real_type (*)[tile_size]) As_ptr; + Real_type (*Bs)[tile_size] = (Real_type (*)[tile_size]) Bs_ptr; + Real_type (*Cs)[tile_size] = (Real_type (*)[tile_size]) Cs_ptr; RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type ty) { @@ -158,7 +160,9 @@ void 
MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid) [&](Index_type ty) { RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3(tile_size) + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } } ); // RAJA::loop } diff --git a/src/basic/MAT_MAT_SHARED.cpp b/src/basic/MAT_MAT_SHARED.cpp index f6837c11c..21df11866 100644 --- a/src/basic/MAT_MAT_SHARED.cpp +++ b/src/basic/MAT_MAT_SHARED.cpp @@ -81,5 +81,88 @@ void MAT_MAT_SHARED::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx deallocData(m_C, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MAT_MAT_SHARED::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type N = m_N; + + MAT_MAT_SHARED_DATA_SETUP; + const Index_type Nx = RAJA_DIVIDE_CEILING_INT(N, TL_SZ); + const Index_type Ny = RAJA_DIVIDE_CEILING_INT(N, TL_SZ); + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type by = 0; by < Ny; ++by)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type bx = 0; bx < Nx; ++bx)) { + RAJAPERF_COUNTERS_TEAM_CONTEXT(); + + //Work around for when compiling with CLANG and HIP + //See notes in MAT_MAT_SHARED.hpp + RAJAPERF_COUNTERS_LOOP_BODY(MAT_MAT_SHARED_BODY_0_CLANG_HIP_CPU(TL_SZ)); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ty = 0; ty < TL_SZ; ++ty)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type tx = 0; tx < TL_SZ; ++tx)) { + RAJAPERF_COUNTERS_LOOP_BODY(MAT_MAT_SHARED_BODY_1(TL_SZ)); + } + } + + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type k = 0; k < (TL_SZ + N - 1) / TL_SZ; ++k)) { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ty = 0; ty < TL_SZ; ++ty)) { + 
RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type tx = 0; tx < TL_SZ; ++tx)) { + RAJAPERF_COUNTERS_LOOP_BODY(MAT_MAT_SHARED_BODY_2(TL_SZ)); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ty = 0; ty < TL_SZ; ++ty)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type tx = 0; tx < TL_SZ; ++tx)) { + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type n = 0; n < TL_SZ; ++n)) { + RAJAPERF_COUNTERS_LOOP_BODY(MAT_MAT_SHARED_BODY_3(TL_SZ)); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + } // Sequential loop + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ty = 0; ty < TL_SZ; ++ty)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type tx = 0; tx < TL_SZ; ++tx)) { + RAJAPERF_COUNTERS_LOOP_BODY(MAT_MAT_SHARED_BODY_4(TL_SZ)); + } + } + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MAT_MAT_SHARED.hpp b/src/basic/MAT_MAT_SHARED.hpp index 5613167c7..fc638fe65 100644 --- a/src/basic/MAT_MAT_SHARED.hpp +++ b/src/basic/MAT_MAT_SHARED.hpp @@ -86,14 +86,14 @@ constexpr rajaperf::Index_type TL_SZ = 16; so it doesn't see these kind of problems. 
*/ #define MAT_MAT_SHARED_BODY_0_CLANG_HIP_CPU(tile_size) \ - Real_type As[tile_size][tile_size]; \ - Real_type Bs[tile_size][tile_size]; \ - Real_type Cs[tile_size][tile_size]; + Real_array2 As; \ + Real_array2 Bs; \ + Real_array2 Cs; #define MAT_MAT_SHARED_BODY_0(tile_size) \ - RAJA_TEAM_SHARED Real_type As[tile_size][tile_size]; \ - RAJA_TEAM_SHARED Real_type Bs[tile_size][tile_size]; \ - RAJA_TEAM_SHARED Real_type Cs[tile_size][tile_size]; + RAJA_TEAM_SHARED Real_array2 As; \ + RAJA_TEAM_SHARED Real_array2 Bs; \ + RAJA_TEAM_SHARED Real_array2 Cs; #define MAT_MAT_SHARED_BODY_1(tile_size) \ Cs[ty][tx] = 0; @@ -111,7 +111,6 @@ constexpr rajaperf::Index_type TL_SZ = 16; Bs[ty][tx] = 0.0; #define MAT_MAT_SHARED_BODY_3(tile_size) \ - for (Index_type n = 0; n < tile_size; ++n) \ Cs[ty][tx] += As[ty][n] * Bs[n][tx]; #define MAT_MAT_SHARED_BODY_4(tile_size) \ @@ -135,6 +134,7 @@ class MAT_MAT_SHARED : public KernelBase { void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index 9fd6617cb..d3dd81cff 100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -86,5 +86,47 @@ void MULADDSUB::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_in2, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MULADDSUB::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MULADDSUB_DATA_SETUP; + ); 
+ + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(MULADDSUB_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MULADDSUB.hpp b/src/basic/MULADDSUB.hpp index 8f5e584e8..4374f6340 100644 --- a/src/basic/MULADDSUB.hpp +++ b/src/basic/MULADDSUB.hpp @@ -54,6 +54,7 @@ class MULADDSUB : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/basic/MULTI_REDUCE.cpp b/src/basic/MULTI_REDUCE.cpp index 34dca6b6f..55f20c1de 100644 --- a/src/basic/MULTI_REDUCE.cpp +++ b/src/basic/MULTI_REDUCE.cpp @@ -150,5 +150,65 @@ void MULTI_REDUCE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(DataSpace::Host, m_values_final); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MULTI_REDUCE::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MULTI_REDUCE_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + MULTI_REDUCE_SETUP_VALUES; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + MULTI_REDUCE_INIT_VALUES; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(MULTI_REDUCE_BODY(RAJAPERF_ATOMIC_ADD_COUNTING)); + 
} + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + MULTI_REDUCE_FINALIZE_VALUES; + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + MULTI_REDUCE_TEARDOWN_VALUES; + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MULTI_REDUCE.hpp b/src/basic/MULTI_REDUCE.hpp index 71353bba6..73f9b2b8d 100644 --- a/src/basic/MULTI_REDUCE.hpp +++ b/src/basic/MULTI_REDUCE.hpp @@ -91,6 +91,7 @@ class MULTI_REDUCE : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index 601febd49..8b8782b3b 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -87,5 +87,48 @@ void NESTED_INIT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_array, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void NESTED_INIT::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + NESTED_INIT_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type k = 0; k < nk; ++k )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < nj; ++j )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < ni; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(NESTED_INIT_BODY); + } + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + 
RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/NESTED_INIT.hpp b/src/basic/NESTED_INIT.hpp index 8f716465e..ff06c58ca 100644 --- a/src/basic/NESTED_INIT.hpp +++ b/src/basic/NESTED_INIT.hpp @@ -54,6 +54,7 @@ class NESTED_INIT : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp index 73cbc0ec9..255ba86fa 100644 --- a/src/basic/PI_ATOMIC.cpp +++ b/src/basic/PI_ATOMIC.cpp @@ -79,5 +79,60 @@ void PI_ATOMIC::tearDown(VariantID RAJAPERF_UNUSED_ARG(vid), size_t RAJAPERF_UNU } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void PI_ATOMIC::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + PI_ATOMIC_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + *pi = m_pi_init; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_type x = (Real_type(i) + 0.5) * dx; + RAJAPERF_ATOMIC_ADD_COUNTING(*pi, dx / (1.0 + x * x)); + ); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_pi_final = *pi * 4.0; + ); + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_ATOMIC.hpp 
b/src/basic/PI_ATOMIC.hpp index c62454fe0..09a3bc9b0 100644 --- a/src/basic/PI_ATOMIC.hpp +++ b/src/basic/PI_ATOMIC.hpp @@ -61,6 +61,7 @@ class PI_ATOMIC : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/basic/PI_REDUCE.cpp b/src/basic/PI_REDUCE.cpp index 7a37eb60a..1ac059ce6 100644 --- a/src/basic/PI_REDUCE.cpp +++ b/src/basic/PI_REDUCE.cpp @@ -79,5 +79,57 @@ void PI_REDUCE::tearDown(VariantID RAJAPERF_UNUSED_ARG(vid), size_t RAJAPERF_UNU } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void PI_REDUCE::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + PI_REDUCE_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Real_type pi = m_pi_init; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(PI_REDUCE_BODY); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_pi = 4.0 * pi; + ); + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp index 4072a745e..2ee6c0439 100644 --- a/src/basic/PI_REDUCE.hpp +++ b/src/basic/PI_REDUCE.hpp @@ -52,6 +52,7 @@ class PI_REDUCE : public KernelBase void setUp(VariantID vid, size_t tune_idx); void 
updateChecksum(VariantID vid, size_t tune_idx);
   void tearDown(VariantID vid, size_t tune_idx);
+  void setCountedAttributes();
 
   void defineOpenMPTargetVariantTunings();
   void defineSeqVariantTunings();
diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp
index 35aa7f529..ce1f2ef1e 100644
--- a/src/basic/REDUCE3_INT.cpp
+++ b/src/basic/REDUCE3_INT.cpp
@@ -91,5 +91,61 @@ void REDUCE3_INT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
   deallocData(m_vec, vid);
 }
+
+// Only define setCountedAttributes functions past this point
+// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point
+#include "common/CountingMacros.hpp"
+// Drives setUp, the REDUCE3_INT loop, and tearDown through the RAJAPERF_COUNTERS_* macros.
+void REDUCE3_INT::setCountedAttributes()
+{
+  VariantID vid = VariantID::Base_Seq; // variant passed to setUp/tearDown below
+  size_t tune_idx = 0;                 // tuning index passed to setUp/tearDown below
+
+  RAJAPERF_COUNTERS_INITIALIZE();
+
+  RAJAPERF_COUNTERS_CODE_WRAPPER(
+    setUp(vid, tune_idx);
+  );
+
+  { // scope for ibegin/iend and whatever REDUCE3_INT_DATA_SETUP declares
+    RAJAPERF_COUNTERS_CODE_WRAPPER(
+      const Index_type ibegin = 0;
+      const Index_type iend = getActualProblemSize();
+
+      REDUCE3_INT_DATA_SETUP;
+    );
+
+    RAJAPERF_COUNTERS_REP_SCOPE()
+    {
+      // Reduction accumulators start from the kernel's stored initial values.
+      RAJAPERF_COUNTERS_CODE_WRAPPER(
+        Int_type vsum = m_vsum_init;
+        Int_type vmin = m_vmin_init;
+        Int_type vmax = m_vmax_init;
+      );
+
+      RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) {
+        RAJAPERF_COUNTERS_LOOP_BODY(REDUCE3_INT_BODY);
+      }
+
+      RAJAPERF_COUNTERS_PAR_SYNC(); // NOTE(review): macro defined in CountingMacros.hpp (not shown) - presumably marks a sync point; confirm
+      // Store the loop's results in the kernel members.
+      RAJAPERF_COUNTERS_CODE_WRAPPER(
+        m_vsum = vsum;
+        m_vmin = vmin;
+        m_vmax = vmax;
+      );
+
+    }
+
+  }
+
+  RAJAPERF_COUNTERS_CODE_WRAPPER(
+    tearDown(vid, tune_idx);
+  );
+
+  RAJAPERF_COUNTERS_FINALIZE();
+}
+
 
 } // end namespace basic
 } // end namespace rajaperf
diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp
index 22f148a84..c22735507 100644
--- a/src/basic/REDUCE3_INT.hpp
+++ b/src/basic/REDUCE3_INT.hpp
@@ -66,6 +66,7 @@ class REDUCE3_INT : public KernelBase
   void setUp(VariantID vid, size_t tune_idx);
   void updateChecksum(VariantID vid, size_t tune_idx);
   void tearDown(VariantID vid, size_t tune_idx);
+  void setCountedAttributes(); // defined in REDUCE3_INT.cpp, below its CountingMacros.hpp include
 
   void defineOpenMPTargetVariantTunings();
   void defineKokkosVariantTunings();
diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp
index b4f040323..4970e868f 100644
--- a/src/basic/REDUCE_STRUCT.cpp
+++ b/src/basic/REDUCE_STRUCT.cpp
@@ -102,5 +102,64 @@ void REDUCE_STRUCT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)
   deallocData(m_y, vid);
 }
+
+// Only define setCountedAttributes functions past this point
+// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point
+#include "common/CountingMacros.hpp"
+// Drives setUp, the REDUCE_STRUCT loop, and tearDown through the RAJAPERF_COUNTERS_* macros.
+void REDUCE_STRUCT::setCountedAttributes()
+{
+  VariantID vid = VariantID::Base_Seq; // variant passed to setUp/tearDown below
+  size_t tune_idx = 0;                 // tuning index passed to setUp/tearDown below
+
+  RAJAPERF_COUNTERS_INITIALIZE();
+
+  RAJAPERF_COUNTERS_CODE_WRAPPER(
+    setUp(vid, tune_idx);
+  );
+
+  { // scope for ibegin/iend and whatever REDUCE_STRUCT_DATA_SETUP declares
+    RAJAPERF_COUNTERS_CODE_WRAPPER(
+      const Index_type ibegin = 0;
+      const Index_type iend = getActualProblemSize();
+
+      REDUCE_STRUCT_DATA_SETUP;
+    );
+
+    RAJAPERF_COUNTERS_REP_SCOPE()
+    {
+      // Six accumulators: sum, min and max for each of x and y.
+      RAJAPERF_COUNTERS_CODE_WRAPPER(
+        Real_type xsum = m_init_sum; Real_type ysum = m_init_sum;
+        Real_type xmin = m_init_min; Real_type ymin = m_init_min;
+        Real_type xmax = m_init_max; Real_type ymax = m_init_max;
+      );
+
+      RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) {
+        RAJAPERF_COUNTERS_LOOP_BODY(REDUCE_STRUCT_BODY);
+      }
+
+      RAJAPERF_COUNTERS_PAR_SYNC(); // NOTE(review): macro defined in CountingMacros.hpp (not shown) - presumably marks a sync point; confirm
+      // Write the centre (sum / N) and the extrema into points, then store points in m_points.
+      RAJAPERF_COUNTERS_CODE_WRAPPER(
+        points.SetCenter(xsum/(points.N), ysum/(points.N));
+        points.SetXMin(xmin);
+        points.SetXMax(xmax);
+        points.SetYMin(ymin);
+        points.SetYMax(ymax);
+        m_points = points;
+      );
+
+    }
+
+  }
+
+  RAJAPERF_COUNTERS_CODE_WRAPPER(
+    tearDown(vid, tune_idx);
+  );
+
+  RAJAPERF_COUNTERS_FINALIZE();
+}
+
 
 } // end namespace basic
 } // end namespace rajaperf
diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp
index 7555e24a2..44d5aa6fa 100644
--- a/src/basic/REDUCE_STRUCT.hpp
+++ b/src/basic/REDUCE_STRUCT.hpp
@@ -82,6 +82,7 @@ class REDUCE_STRUCT : public KernelBase
   void setUp(VariantID vid, size_t tune_idx);
   void updateChecksum(VariantID vid, size_t tune_idx);
   void tearDown(VariantID vid, size_t tune_idx);
+  void setCountedAttributes(); // defined in REDUCE_STRUCT.cpp, below its CountingMacros.hpp include
 
   void defineOpenMPTargetVariantTunings();
   void defineSeqVariantTunings();
diff --git a/src/basic/TRAP_INT-func.hpp b/src/basic/TRAP_INT-func.hpp
index fdf281b3b..486bbb87c 100644
--- a/src/basic/TRAP_INT-func.hpp
+++ b/src/basic/TRAP_INT-func.hpp
@@ -29,6 +29,20 @@ Real_type trap_int_func(Real_type x,
    denom = 1.0/sqrt(denom);
    return denom;
 }
+/// Returns 1/sqrt((x - xp)^2 + (y - yp)^2); each difference is computed once and reused.
+RAJA_INLINE
+RAJA_HOST_DEVICE
+Real_type trap_int_opt_func(Real_type x,
+                            Real_type y,
+                            Real_type xp,
+                            Real_type yp)
+{
+   Real_type xmxp = x - xp; // x minus xp
+   Real_type ymyp = y - yp; // y minus yp
+   Real_type denom = xmxp*xmxp + ymyp*ymyp;
+   denom = 1.0/sqrt(denom);
+   return denom;
+}
 
 } // end namespace basic
 } // end namespace rajaperf
diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp
index 3fa9d7317..3c8f9d569 100644
--- a/src/basic/TRAP_INT.cpp
+++ b/src/basic/TRAP_INT.cpp
@@ -89,5 +89,69 @@ void TRAP_INT::tearDown(VariantID RAJAPERF_UNUSED_ARG(vid), size_t RAJAPERF_UNUS
 
 }
+
+// Only define setCountedAttributes functions past this point
+// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point
+#include "common/CountingMacros.hpp"
+
+
+} // end namespace basic
+} // end namespace rajaperf
+
+// This shouldn't result in ODR violations as the argument types have changed
+#include "TRAP_INT-func.hpp" // included here at global scope, after both namespaces above are closed
+
+namespace rajaperf
+{
+namespace basic
+{
+// Drives setUp, the TRAP_INT loop (using TRAP_INT_OPT_BODY), and tearDown through the RAJAPERF_COUNTERS_* macros.
+void TRAP_INT::setCountedAttributes()
+{
+  VariantID vid = VariantID::Base_Seq; // variant passed to setUp/tearDown below
+  size_t tune_idx = 0;                 // tuning index passed to setUp/tearDown below
+
+  RAJAPERF_COUNTERS_INITIALIZE();
+
+  RAJAPERF_COUNTERS_CODE_WRAPPER(
+    setUp(vid, tune_idx);
+  );
+
+  { // scope for ibegin/iend and whatever TRAP_INT_DATA_SETUP declares
+    RAJAPERF_COUNTERS_CODE_WRAPPER(
+      const Index_type ibegin = 0;
+      const Index_type iend = getActualProblemSize();
+
+      TRAP_INT_DATA_SETUP;
+    );
+
+    RAJAPERF_COUNTERS_REP_SCOPE()
+    {
+
+      RAJAPERF_COUNTERS_CODE_WRAPPER(
+        Real_type sumx = m_sumx_init;
+      );
+
+      
RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) {
+        RAJAPERF_COUNTERS_LOOP_BODY(TRAP_INT_OPT_BODY);
+      }
+
+      RAJAPERF_COUNTERS_PAR_SYNC();
+      // Fold this repetition's sum, scaled by h, into the kernel member.
+      RAJAPERF_COUNTERS_CODE_WRAPPER(
+        m_sumx += sumx * h;
+      );
+
+    }
+
+  }
+
+  RAJAPERF_COUNTERS_CODE_WRAPPER(
+    tearDown(vid, tune_idx);
+  );
+
+  RAJAPERF_COUNTERS_FINALIZE();
+}
+
 
 } // end namespace basic
 } // end namespace rajaperf
diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp
index c7cb86115..570d275e4 100644
--- a/src/basic/TRAP_INT.hpp
+++ b/src/basic/TRAP_INT.hpp
@@ -41,6 +41,10 @@
   Real_type x = x0 + i*h; \
   sumx += trap_int_func(x, y, xp, yp);
 
+#define TRAP_INT_OPT_BODY \
+  Real_type x = x0 + i*h; \
+  sumx += trap_int_opt_func(x, y, xp, yp);
+
 
 #include "common/KernelBase.hpp"
 
@@ -63,6 +67,7 @@ class TRAP_INT : public KernelBase
   void setUp(VariantID vid, size_t tune_idx);
   void updateChecksum(VariantID vid, size_t tune_idx);
   void tearDown(VariantID vid, size_t tune_idx);
+  void setCountedAttributes(); // defined in TRAP_INT.cpp, below its CountingMacros.hpp include
 
   void defineOpenMPTargetVariantTunings();
   void defineKokkosVariantTunings();
diff --git a/src/comm/HALO_PACKING.cpp b/src/comm/HALO_PACKING.cpp
index e2937ef0c..126d0a93e 100644
--- a/src/comm/HALO_PACKING.cpp
+++ b/src/comm/HALO_PACKING.cpp
@@ -114,5 +114,105 @@ void HALO_PACKING::tearDown(VariantID vid, size_t tune_idx)
   tearDown_base(vid, tune_idx);
 }
+
+// Only define setCountedAttributes functions past this point
+// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point
+#include "common/CountingMacros.hpp"
+
+// Drives setUp, the per-neighbor pack and unpack loops, and tearDown through
+// the RAJAPERF_COUNTERS_* macros, using the Base_Seq variant and tuning index 0.
+void HALO_PACKING::setCountedAttributes()
+{
+  VariantID vid = VariantID::Base_Seq;
+  size_t tune_idx = 0;
+
+  RAJAPERF_COUNTERS_INITIALIZE();
+
+  RAJAPERF_COUNTERS_CODE_WRAPPER(
+    setUp(vid, tune_idx);
+  );
+
+  {
+    RAJAPERF_COUNTERS_CODE_WRAPPER(
+      HALO_PACKING_DATA_SETUP;
+    );
+
+    RAJAPERF_COUNTERS_REP_SCOPE()
+    {
+
+      // Pack phase: per neighbor and variable, HALO_PACK_BODY runs len times, then buffer advances by len.
+      RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type l = 0; l < num_neighbors; ++l)) {
+        RAJAPERF_COUNTERS_LOOP_BODY(
+          Real_ptr buffer = pack_buffers[l];
+          Int_ptr list = pack_index_lists[l];
+          Index_type len = pack_index_list_lengths[l];
+        );
+        RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type v = 0; v < num_vars; ++v)) {
+          RAJAPERF_COUNTERS_LOOP_BODY(
+            Real_ptr var = vars[v];
+          );
+          RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len; i++)) {
+            RAJAPERF_COUNTERS_LOOP_BODY(HALO_PACK_BODY);
+          }
+          RAJAPERF_COUNTERS_LOOP_BODY(
+            buffer += len;
+          );
+        }
+
+        RAJAPERF_COUNTERS_IF(if (separate_buffers)) {
+          // buffer now points len*num_vars past the packed data, so the copy must start from pack_buffers[l] again.
+          RAJAPERF_COUNTERS_LOOP_BODY(
+            Real_ptr send_buffer = send_buffers[l];
+            Real_ptr pack_buffer = pack_buffers[l];
+          );
+          RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len*num_vars; i++)) {
+            RAJAPERF_COUNTERS_LOOP_BODY(send_buffer[i] = pack_buffer[i]);
+          }
+        }
+
+        RAJAPERF_COUNTERS_PAR_SYNC();
+      }
+
+      // Unpack phase: the optional recv copy runs before buffer is advanced, so buffer still points at the start.
+      RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type l = 0; l < num_neighbors; ++l)) {
+        RAJAPERF_COUNTERS_LOOP_BODY(
+          Real_ptr buffer = unpack_buffers[l];
+          Int_ptr list = unpack_index_lists[l];
+          Index_type len = unpack_index_list_lengths[l];
+        );
+        RAJAPERF_COUNTERS_IF(if (separate_buffers)) {
+          RAJAPERF_COUNTERS_LOOP_BODY(
+            Real_ptr recv_buffer = recv_buffers[l];
+          );
+          RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len*num_vars; i++)) {
+            RAJAPERF_COUNTERS_LOOP_BODY(buffer[i] = recv_buffer[i]);
+          }
+        }
+
+        RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type v = 0; v < num_vars; ++v)) {
+          RAJAPERF_COUNTERS_LOOP_BODY(
+            Real_ptr var = vars[v];
+          );
+          RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len; i++)) {
+            RAJAPERF_COUNTERS_LOOP_BODY(HALO_UNPACK_BODY);
+          }
+          RAJAPERF_COUNTERS_LOOP_BODY(
+            buffer += len;
+          );
+        }
+      }
+      RAJAPERF_COUNTERS_PAR_SYNC();
+
+    }
+
+  }
+
+  RAJAPERF_COUNTERS_CODE_WRAPPER(
+    tearDown(vid, tune_idx);
+  );
+
+  RAJAPERF_COUNTERS_FINALIZE();
+}
+
 
 } // end namespace comm
 } // end namespace rajaperf
diff --git a/src/comm/HALO_PACKING.hpp b/src/comm/HALO_PACKING.hpp
index 7b4c41149..1d8b0145a 100644
--- a/src/comm/HALO_PACKING.hpp
+++ b/src/comm/HALO_PACKING.hpp
@@ -82,6 +82,7 @@ class HALO_PACKING : public HALO_base
   void
setUp(VariantID vid, size_t tune_idx);
   void updateChecksum(VariantID vid, size_t tune_idx);
   void tearDown(VariantID vid, size_t tune_idx);
+  void setCountedAttributes(); // defined in HALO_PACKING.cpp, below its CountingMacros.hpp include
 
   void defineSeqVariantTunings();
   void defineOpenMPVariantTunings();
diff --git a/src/comm/HALO_PACKING_FUSED.cpp b/src/comm/HALO_PACKING_FUSED.cpp
index b3b883ed1..359e33f72 100644
--- a/src/comm/HALO_PACKING_FUSED.cpp
+++ b/src/comm/HALO_PACKING_FUSED.cpp
@@ -114,5 +114,137 @@ void HALO_PACKING_FUSED::tearDown(VariantID vid, size_t tune_idx)
   tearDown_base(vid, tune_idx);
 }
+
+// Only define setCountedAttributes functions past this point
+// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point
+#include "common/CountingMacros.hpp"
+// Drives setUp, the fused pack and unpack loops, and tearDown through the RAJAPERF_COUNTERS_* macros.
+void HALO_PACKING_FUSED::setCountedAttributes()
+{
+  VariantID vid = VariantID::Base_Seq; // variant passed to setUp/tearDown below
+  size_t tune_idx = 0;                 // tuning index passed to setUp below
+
+  RAJAPERF_COUNTERS_INITIALIZE();
+
+  RAJAPERF_COUNTERS_CODE_WRAPPER(
+    setUp(vid, tune_idx);
+  );
+
+  {
+    RAJAPERF_COUNTERS_CODE_WRAPPER(
+      HALO_PACKING_FUSED_DATA_SETUP;
+    );
+    // Allocates the holder and length arrays (see the macro in HALO_PACKING_FUSED.hpp).
+    RAJAPERF_COUNTERS_CODE_WRAPPER(
+      HALO_PACKING_FUSED_MANUAL_FUSER_SETUP;
+    );
+
+    RAJAPERF_COUNTERS_REP_SCOPE()
+    {
+
+      RAJAPERF_COUNTERS_CODE_WRAPPER(
+        Index_type pack_index = 0;
+      );
+      // Record one {buffer, list, var} holder and one length per (neighbor, variable) pair.
+      RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type l = 0; l < num_neighbors; ++l)) {
+        RAJAPERF_COUNTERS_LOOP_BODY(
+          Real_ptr buffer = pack_buffers[l];
+          Int_ptr list = pack_index_lists[l];
+          Index_type len = pack_index_list_lengths[l];
+        );
+        RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type v = 0; v < num_vars; ++v)) {
+          RAJAPERF_COUNTERS_LOOP_BODY(
+            Real_ptr var = vars[v];
+            pack_ptr_holders[pack_index] = ptr_holder{buffer, list, var};
+            pack_lens[pack_index] = len;
+            pack_index += 1;
+            buffer += len;
+          );
+        }
+      }
+      RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < pack_index; j++)) { // one pass per recorded holder
+        RAJAPERF_COUNTERS_LOOP_BODY(
+          ptr_holder pack_ptrs = pack_ptr_holders[j];
+          Real_ptr buffer = pack_ptrs.buffer;
+          Int_ptr list = pack_ptrs.list;
+          Real_ptr var = pack_ptrs.var;
+          Index_type len = pack_lens[j];
+        );
+        RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len; i++)) {
+          RAJAPERF_COUNTERS_LOOP_BODY(HALO_PACK_BODY);
+        }
+      }
+      RAJAPERF_COUNTERS_IF(if (separate_buffers)) { // copy each pack buffer, from its start, into the matching send buffer
+        RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type l = 0; l < num_neighbors; ++l)) {
+          RAJAPERF_COUNTERS_LOOP_BODY(
+            Index_type len = pack_index_list_lengths[l];
+            Real_ptr send_buffer = send_buffers[l];
+            Real_ptr buffer = pack_buffers[l];
+          );
+          RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len*num_vars; i++)) {
+            RAJAPERF_COUNTERS_LOOP_BODY(send_buffer[i] = buffer[i]);
+          }
+        }
+      }
+      RAJAPERF_COUNTERS_PAR_SYNC(); // NOTE(review): macro defined in CountingMacros.hpp (not shown) - presumably marks a sync point; confirm
+
+      RAJAPERF_COUNTERS_CODE_WRAPPER(
+        Index_type unpack_index = 0;
+      );
+      // Unpack mirrors the pack phase; the optional recv copy runs before the holders are recorded.
+      RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type l = 0; l < num_neighbors; ++l)) {
+        RAJAPERF_COUNTERS_LOOP_BODY(
+          Real_ptr buffer = unpack_buffers[l];
+          Int_ptr list = unpack_index_lists[l];
+          Index_type len = unpack_index_list_lengths[l];
+        );
+        RAJAPERF_COUNTERS_IF(if (separate_buffers)) {
+          RAJAPERF_COUNTERS_LOOP_BODY(
+            Real_ptr recv_buffer = recv_buffers[l];
+          );
+          RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len*num_vars; i++)) {
+            RAJAPERF_COUNTERS_LOOP_BODY(buffer[i] = recv_buffer[i]);
+          }
+        }
+
+        RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type v = 0; v < num_vars; ++v)) {
+          RAJAPERF_COUNTERS_LOOP_BODY(
+            Real_ptr var = vars[v];
+            unpack_ptr_holders[unpack_index] = ptr_holder{buffer, list, var};
+            unpack_lens[unpack_index] = len;
+            unpack_index += 1;
+            buffer += len;
+          );
+        }
+      }
+      RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < unpack_index; j++)) { // one pass per recorded holder
+        RAJAPERF_COUNTERS_LOOP_BODY(
+          ptr_holder unpack_ptrs = unpack_ptr_holders[j];
+          Real_ptr buffer = unpack_ptrs.buffer;
+          Int_ptr list = unpack_ptrs.list;
+          Real_ptr var = unpack_ptrs.var;
+          Index_type len = unpack_lens[j];
+        );
+        RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len; i++)) {
+          RAJAPERF_COUNTERS_LOOP_BODY(HALO_UNPACK_BODY);
+        }
+      }
+      RAJAPERF_COUNTERS_PAR_SYNC();
+
+    }
+
+    RAJAPERF_COUNTERS_CODE_WRAPPER(
+      HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN;
+    );
+
+  }
+
+  RAJAPERF_COUNTERS_CODE_WRAPPER(
+    tearDown(vid, 0); // NOTE(review): literal 0 here, while setUp above receives tune_idx (also 0)
+  );
+
+  RAJAPERF_COUNTERS_FINALIZE();
+}
+
 
 } // end namespace comm
 } // end namespace rajaperf
diff --git a/src/comm/HALO_PACKING_FUSED.hpp b/src/comm/HALO_PACKING_FUSED.hpp
index 875d0da75..ec9d48b52 100644
--- a/src/comm/HALO_PACKING_FUSED.hpp
+++ b/src/comm/HALO_PACKING_FUSED.hpp
@@ -59,9 +59,9 @@
   Real_ptr_ptr recv_buffers = m_recv_buffers;
 
 #define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP \
-  ptr_holder* pack_ptr_holders = nullptr; \
+  RAJAPERF_WRAPPER(ptr_holder*) pack_ptr_holders = nullptr; \
   Index_ptr pack_lens = nullptr; \
-  ptr_holder* unpack_ptr_holders = nullptr; \
+  RAJAPERF_WRAPPER(ptr_holder*) unpack_ptr_holders = nullptr; \
   Index_ptr unpack_lens = nullptr; \
   allocData(DataSpace::Host, pack_ptr_holders, num_neighbors * num_vars); \
   allocData(DataSpace::Host, pack_lens, num_neighbors * num_vars); \
@@ -132,6 +132,7 @@ class HALO_PACKING_FUSED : public HALO_base
   void setUp(VariantID vid, size_t tune_idx);
   void updateChecksum(VariantID vid, size_t tune_idx);
   void tearDown(VariantID vid, size_t tune_idx);
+  void setCountedAttributes(); // defined in HALO_PACKING_FUSED.cpp, below its CountingMacros.hpp include
 
   void defineSeqVariantTunings();
   void defineOpenMPVariantTunings();
diff --git a/src/comm/HALO_base.hpp b/src/comm/HALO_base.hpp
index e9e3f464b..411628d25 100644
--- a/src/comm/HALO_base.hpp
+++ b/src/comm/HALO_base.hpp
@@ -128,7 +128,7 @@ class HALO_base : public KernelBase
     Index_type k_max;
   };
 
-  static const int s_num_neighbors = 26;
+  static inline constexpr int s_num_neighbors = 26; // also the first dimension of s_boundary_offsets below
   static const int s_boundary_offsets[s_num_neighbors][3];
   static Index_type s_grid_dims_default[3];
 
diff --git a/src/common/CountingData.hpp b/src/common/CountingData.hpp
new file mode 100644
index 000000000..27fff3f7b
--- /dev/null
+++ b/src/common/CountingData.hpp
@@ -0,0 +1,1294 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-25, Lawrence
Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJAPerf_CountingData_HPP +#define RAJAPerf_CountingData_HPP + +#include "common/RAJAPerfSuite.hpp" +#include "common/RPTypes.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace rajaperf +{ + +namespace counting +{ + +enum struct OpType : int +{ + fp64, + int32, + int64, + ptr, + other, + NumOpTypes // must be at the end of the valid values +}; + +template < typename T > +constexpr OpType getOpType() +{ + using decayed_T = std::decay_t; + if constexpr (std::is_floating_point_v && sizeof(decayed_T) == sizeof(double)) { + return OpType::fp64; + } else if constexpr (std::is_integral_v && sizeof(decayed_T) == sizeof(std::int32_t)) { + return OpType::int32; + } else if constexpr (std::is_integral_v && sizeof(decayed_T) == sizeof(std::int64_t)) { + return OpType::int64; + } else if constexpr (std::is_pointer_v) { + return OpType::ptr; + } else { + return OpType::other; + } +} + +constexpr const char* getOpTypeName(OpType ot) +{ + switch (ot) { + case OpType::int32: return "int32"; + case OpType::int64: return "int64"; + case OpType::ptr: return "ptr"; + case OpType::fp64: return "fp64"; + case OpType::other: return "other"; + default: throw std::invalid_argument("ot is not in OpType"); + } +} + +template < typename T > +const char* get_type_name() +{ + OpType ot = getOpType(); + if (ot != OpType::other) { + return getOpTypeName(ot); + } else { + return typeid(T).name(); + } +} + +enum struct Operation : int +{ + copy, + assign, + load, + store, + uplus, + uminus, + abs, + add, + sub, + mult, + div, + rem, + preinc, + predec, + postinc, + postdec, + atomic_add, + sqrt, + exp, 
+ bit_not, + bit_and, + bit_or, + bit_xor, + bit_lsh, + bit_rsh, + eq, + ne, + lt, + le, + gt, + ge, + NumOperations, // must be at the end of the valid values + FLOP_begin = add, // used when counting what counts as a flop + FLOP_end = eq // used when counting what counts as a flop +}; + +constexpr const char* getOperationName(Operation op) +{ + switch (op) { + case Operation::copy: return "copy"; + case Operation::assign: return "assign"; + case Operation::load: return "load"; + case Operation::store: return "store"; + case Operation::uplus: return "uplus"; + case Operation::uminus: return "uminus"; + case Operation::abs: return "abs"; + case Operation::add: return "add"; + case Operation::sub: return "sub"; + case Operation::mult: return "mult"; + case Operation::div: return "div"; + case Operation::rem: return "rem"; + case Operation::preinc: return "preinc"; + case Operation::predec: return "predec"; + case Operation::postinc: return "postinc"; + case Operation::postdec: return "postdec"; + case Operation::atomic_add: return "atomic_add"; + case Operation::sqrt: return "sqrt"; + case Operation::exp: return "exp"; + case Operation::bit_not: return "bit_not"; + case Operation::bit_and: return "bit_and"; + case Operation::bit_or: return "bit_or"; + case Operation::bit_xor: return "bit_xor"; + case Operation::bit_lsh: return "bit_lsh"; + case Operation::bit_rsh: return "bit_rsh"; + case Operation::eq: return "eq"; + case Operation::ne: return "ne"; + case Operation::lt: return "lt"; + case Operation::le: return "le"; + case Operation::gt: return "gt"; + case Operation::ge: return "ge"; + default: throw std::invalid_argument("op is not in Operation"); + } +} + +enum struct ContextType : int +{ + exterior, + outer, + repetition, + cond, + outer_loop, + seq_loop, + par_loop, + team, + body, + par_sync, + team_sync, + NumContextTypes // must be at the end of the valid values +}; + +constexpr const char* getContextTypeName(ContextType ct) +{ + switch (ct) { + case 
ContextType::exterior: return "exterior"; + case ContextType::outer: return "outer"; + case ContextType::repetition: return "repetition"; + case ContextType::cond: return "cond"; + case ContextType::outer_loop: return "outer_loop"; + case ContextType::seq_loop: return "seq_loop"; + case ContextType::par_loop: return "par_loop"; + case ContextType::team: return "team"; + case ContextType::body: return "body"; + case ContextType::par_sync: return "par_sync"; + case ContextType::team_sync: return "team_sync"; + default: throw std::invalid_argument("Unknown ContextType"); + } +} + +enum struct MemoryAccess : int +{ + read, + write, + atomicModifyWrite, + NumMemoryAccesses // must be at the end of the valid values +}; + +constexpr const char* getMemoryAccessName(MemoryAccess ma) +{ + switch (ma) { + case MemoryAccess::read: return "read"; + case MemoryAccess::write: return "write"; + case MemoryAccess::atomicModifyWrite: return "atomicModifyWrite"; + default: throw std::invalid_argument("Unknown MemoryAccess"); + } +} + +constexpr const char* getMemoryAccessNamePastTense(MemoryAccess ma) +{ + switch (ma) { + case MemoryAccess::read: return "read"; + case MemoryAccess::write: return "written"; + case MemoryAccess::atomicModifyWrite: return "atomicModifyWritten"; + default: throw std::invalid_argument("Unknown MemoryAccess"); + } +} + +constexpr const char* getMemoryAccessNamePastTenseTitle(MemoryAccess ma) +{ + switch (ma) { + case MemoryAccess::read: return "Read"; + case MemoryAccess::write: return "Written"; + case MemoryAccess::atomicModifyWrite: return "AtomicModifyWritten"; + default: throw std::invalid_argument("Unknown MemoryAccess"); + } +} + +enum struct AllocationGroup : int +{ + global, + team, + NumAllocationGroups // must be at the end of the valid values +}; + +constexpr const char* getAllocationGroupName(AllocationGroup ma) +{ + switch (ma) { + case AllocationGroup::global: return "global"; + case AllocationGroup::team: return "team"; + default: throw 
std::invalid_argument("Unknown AllocationGroup"); + } +} + +// Must be in order innermost to outermost, so loop must be before rep, etc. +enum struct CountingPoint : int +{ + team, + loop, + rep, + NumCountingPoints // must be at the end of the valid values +}; + +constexpr const char* getCountingPointName(CountingPoint ma) +{ + switch (ma) { + case CountingPoint::team: return "team"; + case CountingPoint::loop: return "loop"; + case CountingPoint::rep: return "rep"; + default: throw std::invalid_argument("Unknown CountingPoint"); + } +} + + +constexpr std::string get_spacing(Size_type depth) +{ + return std::string(depth*2, ' '); +} + +struct MemoryCounts +{ + Size_type touched = 0; + Size_type accessed[Size_type(MemoryAccess::NumMemoryAccesses)] = {0}; + + void add(MemoryCounts const& other_counts, Size_type multiplier = 1) + { + touched += other_counts.touched * multiplier; + for (Size_type a = 0; a < Size_type(MemoryAccess::NumMemoryAccesses); ++a) { + accessed[a] += other_counts.accessed[a] * multiplier; + } + } +}; + +struct AddressTouches +{ + std::vector address_accessed[Size_type(MemoryAccess::NumMemoryAccesses)]; + + AddressTouches() = default; + + explicit AddressTouches(Size_type size, bool value = false) + { + resize(size, value); + } + + void resize(Size_type size, bool value = false) + { + for (Size_type a = 0; a < Size_type(MemoryAccess::NumMemoryAccesses); ++a) { + address_accessed[a].resize(size, value); + } + } + + Size_type size() const + { + return address_accessed[0].size(); + } + + void set_all(Size_type size, bool value) + { + for (Size_type a = 0; a < Size_type(MemoryAccess::NumMemoryAccesses); ++a) { + for (Size_type i = 0; i < size; ++i) { + address_accessed[a][i] = value; + } + } + } + + void count(Size_type size, + MemoryCounts& address_counts) const + { + for (Size_type i = 0; i < size; ++i) { + bool addr_touched = false; + for (Size_type a = 0; a < Size_type(MemoryAccess::NumMemoryAccesses); ++a) { + bool addr_accessed = 
address_accessed[a][i]; + addr_touched = addr_touched || addr_accessed; + address_counts.accessed[a] += addr_accessed ? 1 : 0; + } + address_counts.touched += addr_touched ? 1 : 0; + } + } + + void combine(Size_type size, + AddressTouches const& other_touches) + { + for (Size_type a = 0; a < Size_type(MemoryAccess::NumMemoryAccesses); ++a) { + for (Size_type i = 0; i < size; ++i) { + address_accessed[a][i] = other_touches.address_accessed[a][i] || address_accessed[a][i]; + } + } + } + + void clear() + { + for (Size_type a = 0; a < Size_type(MemoryAccess::NumMemoryAccesses); ++a) { + address_accessed[a].clear(); + address_accessed[a].shrink_to_fit(); + } + } +}; + +struct TouchCounts +{ + Size_type m_size = 0; + MemoryCounts total_counts; + MemoryCounts address_counts[Size_type(CountingPoint::NumCountingPoints)]; + AddressTouches address_touches[Size_type(CountingPoint::NumCountingPoints)]; + + TouchCounts() = default; + + TouchCounts(CountingPoint point, Size_type size) + { + resize(point, size); + } + + void resize(CountingPoint point, Size_type size) + { + for (Size_type p = Size_type(point); + p < Size_type(CountingPoint::NumCountingPoints); ++p) { + address_touches[p].resize(size); + } + m_size = size; + } + + Size_type size() const + { + return m_size; + } + + void set_all_accesses(CountingPoint point, bool value) + { + address_touches[Size_type(point)].set_all(m_size, value); + } + + void touch(CountingPoint point, MemoryAccess access, Size_type offset, + Size_type num_ops) + { + if (point < CountingPoint::NumCountingPoints) { + total_counts.touched += num_ops; + total_counts.accessed[Size_type(access)] += num_ops; + address_touches[Size_type(point)].address_accessed[Size_type(access)].at(offset) = true; + } + } + + void count(CountingPoint point) + { + address_touches[Size_type(point)].count(m_size, address_counts[Size_type(point)]); + } + + void combine_accesses(CountingPoint point, + TouchCounts const& other_touches, + CountingPoint other_point) + { + 
address_touches[Size_type(point)].combine( + m_size, other_touches.address_touches[Size_type(other_point)]); + } + + void clear_accesses(CountingPoint point) + { + address_touches[Size_type(point)].clear(); + } +}; + + +struct AllocationMetadata +{ + Index_type idx = std::numeric_limits::min(); + const void* ptr_ptr = nullptr; + std::source_location allocate_location; + AllocationGroup group; + + void* ptr = nullptr; + + std::string pointed_to_type_name; + Size_type element_size = 0; + Size_type size = 0; + + TouchCounts counts; + + AllocationMetadata(Index_type idx_, const void* ptr_ptr_, + std::source_location location, AllocationGroup group_, + std::string pointed_to_type_name_, void* ptr_, + Size_type size_, Size_type element_size_) + : idx(idx_) + , ptr_ptr(ptr_ptr_) + , allocate_location(location) + , group(group_) + , ptr(ptr_) + , pointed_to_type_name(std::move(pointed_to_type_name_)) + , element_size(element_size_) + , size(size_) + , counts(CountingPoint(0), size_) + { + } + + void allocate(void* ptr_) + { + ptr = ptr_; + } + + void deallocate() + { + ptr = nullptr; + } + + void print_allocation(std::ostream& str, Size_type depth) const + { + std::string spacing = get_spacing(depth); + spacing += "// "; + str << spacing << pointed_to_type_name << "* allocation_" << idx + << " = " << getAllocationGroupName(group) << "_malloc(" + << size << " * " << element_size << ");\n"; + } + + void print_deallocation(std::ostream& str, Size_type depth) const + { + std::string spacing = get_spacing(depth); + spacing += "// "; + str << spacing << getAllocationGroupName(group) << "_free(" + << "allocation_" << idx << ");\n"; + } +}; + +struct Context +{ + Index_type idx = -1; + Size_type hit_count = 0; + ContextType type = ContextType::NumContextTypes; + const char* text = nullptr; + CountingPoint point = CountingPoint::NumCountingPoints; + Index_type point_depth = 0; + + Context* parent = nullptr; + // children are stored in order of increasing idx + std::vector> 
children; + std::vector child_idcs; + + Size_type operation_counters[Size_type(OpType::NumOpTypes)][Size_type(Operation::NumOperations)] = {{0}}; + + std::vector aloc_counts; + + MemoryCounts aloc_total_bytes; + MemoryCounts aloc_totals_bytes[Size_type(CountingPoint::NumCountingPoints)]; + + std::vector allocation_indices; + std::vector deallocation_indices; + + static constexpr CountingPoint get_point(Context* parent, ContextType type) + { + CountingPoint point = CountingPoint::NumCountingPoints; + if (type == ContextType::repetition) { + point = CountingPoint::rep; + } else if (type == ContextType::par_loop) { + point = CountingPoint::loop; + } else if (type == ContextType::team) { + point = CountingPoint::team; + } + if (parent) { + point = std::min(parent->point, point); + } + return point; + } + + // depth of 0 indicates this does not have a valid point + // depth of 1 indicates this is the first context with this point + // depths greater than 1 are children of of a context of this point + static constexpr Index_type get_depth(Context* parent, CountingPoint point) + { + Index_type depth = 0; + if (parent) { + if (point != parent->point) { + depth = 1; + } else if (parent->point_depth > 0) { + depth = parent->point_depth + 1; + } + } + return depth; + } + + Context(Index_type idx_, Context* parent_, ContextType type_, const char* text_, + std::vector> const& allocations) + : idx(idx_) + , type(type_) + , text(text_) + , point(get_point(parent_, type_)) + , point_depth(get_depth(parent_, get_point(parent_, type_))) + , parent(parent_) + , aloc_counts(allocations.size()) + { + if (type == ContextType::par_sync) { + if (point != CountingPoint::rep) { + throw std::runtime_error("par_sync must be in a repetition context"); + } + } else if (type == ContextType::team_sync) { + if (point != CountingPoint::team) { + throw std::runtime_error("team_sync must be in a team context"); + } + } + for (Size_type i = 0; i < allocations.size(); ++i) { + auto const& item = 
allocations[i]; + aloc_counts[i].resize(point, item->size); + } + } + + void update_allocations(std::vector> const& allocations) + { + for (Size_type i = 0; i < allocations.size(); ++i) { + auto const& item = allocations[i]; + if (i < aloc_counts.size()) { + if (item->size != aloc_counts[i].size()) { + throw std::runtime_error("Allocation record changed since last update"); + } + } else { + aloc_counts.resize(i+1); + aloc_counts[i].resize(point, item->size); + } + } + + for (auto& child_ptr : children) { + child_ptr->update_allocations(allocations); + } + } + + void add_allocation(AllocationMetadata const& item) + { + auto iter = std::ranges::find(allocation_indices, item.idx); + if (iter == allocation_indices.end()) { + allocation_indices.emplace_back(item.idx); + } + } + + void remove_allocation(AllocationMetadata const& item) + { + auto iter = std::ranges::find(deallocation_indices, item.idx); + if (iter == deallocation_indices.end()) { + deallocation_indices.emplace_back(item.idx); + } + } + + template < typename... Args > + Context* get_or_emplace_child(Index_type idx, Args&&... 
args) + { + using std::distance; + auto idx_iter = std::ranges::lower_bound(child_idcs, idx, std::ranges::less{}); + Size_type offset = distance(child_idcs.begin(), idx_iter); + auto iter = children.begin() + offset; + if (idx_iter == child_idcs.end() || *idx_iter != idx) { + idx_iter = child_idcs.emplace(idx_iter, idx); + iter = children.emplace(iter, std::make_unique(idx, this, std::forward(args)...)); + } + return iter->get(); + } + + void count_totals(AllocationMetadata& item) + { + aloc_total_bytes.add(aloc_counts[item.idx].total_counts, item.element_size); + item.counts.total_counts.add(aloc_counts[item.idx].total_counts, item.element_size); + + for (Size_type p = 0; p < Size_type(CountingPoint::NumCountingPoints); ++p) { + aloc_totals_bytes[p].add(aloc_counts[item.idx].address_counts[p], item.element_size); + } + } + + void clear() + { + for (Size_type i = 0; i < aloc_counts.size(); ++i) { + + for (Size_type p = 0; p < Size_type(CountingPoint::NumCountingPoints); ++p) { + aloc_counts[i].clear_accesses(CountingPoint(p)); + } + + } + } + + + void print_header(std::ostream& str, Size_type depth) const + { + std::string spacing = get_spacing(depth); + spacing += "// "; + + str << spacing << "Line " << idx << " hit " << hit_count << " times\n"; + } + + void print_allocations(std::ostream& str, Size_type depth, + std::vector> const& allocations) const + { + for (Index_type const& allocation_idx : allocation_indices) { + allocations[allocation_idx]->print_allocation(str, depth); + } + for (Index_type const& allocation_idx : deallocation_indices) { + allocations[allocation_idx]->print_deallocation(str, depth); + } + } + + void print_allocation_counts(std::ostream& str, Size_type depth, + std::string_view name, + MemoryCounts const& mem_counts) const + { + std::string spacing = get_spacing(depth); + spacing += "// "; + + if (mem_counts.touched) { + str << spacing + << name + << " touched " + << mem_counts.touched << "\n"; + } + + for (Size_type a = 0; a < 
Size_type(MemoryAccess::NumMemoryAccesses); ++a) { + + if (mem_counts.accessed[a]) { + str << spacing + << name + << " " << getMemoryAccessNamePastTense(MemoryAccess(a)) << " " + << mem_counts.accessed[a] << "\n"; + } + + } + } + + void print_counters(std::ostream& str, Size_type depth) const + { + std::string spacing = get_spacing(depth); + spacing += "// "; + + for (Size_type ot = 0; ot < Size_type(OpType::NumOpTypes); ++ot) { + + std::string opTypeName = getOpTypeName(OpType(ot)); + + for (Size_type op = 0; op < Size_type(Operation::NumOperations); ++op) { + + std::string opName = getOperationName(Operation(op)); + + Size_type num_ops = operation_counters[ot][op]; + + if (num_ops > 0) { + str << spacing << opTypeName << " " << opName << " " << num_ops << "\n"; + } + } + } + + print_allocation_counts(str, depth, "bytes", aloc_total_bytes); + + for (Size_type p = 0; p < Size_type(CountingPoint::NumCountingPoints); ++p) { + + std::string name = std::format("by {} bytes", + getCountingPointName(CountingPoint(p))); + + print_allocation_counts(str, depth, name, aloc_totals_bytes[p]); + + } + + for (Size_type i = 0; i < aloc_counts.size(); ++i) { + + for (Size_type p = 0; p < Size_type(CountingPoint::NumCountingPoints); ++p) { + + std::string name = std::format("by {} allocation_{} elements", + getCountingPointName(CountingPoint(p)), i); + + print_allocation_counts(str, depth, + name, aloc_counts[i].address_counts[p]); + + } + + } + + } + + std::string replace_values(std::string str, + std::vector const& wrapper_formats) const + { + for (const char* wrapper_format : wrapper_formats) { + std::regex re(std::vformat(wrapper_format, std::make_format_args("(.*?)"))); + str = std::regex_replace(str, re, "$1"); + } + + return str; + } + + void print_text(std::ostream& str, Size_type depth, + std::vector const& wrapper_formats) const + { + if (text == nullptr) return; + + std::string spacing = get_spacing(depth); + + std::string new_text = replace_values(text, 
wrapper_formats);
+
+    std::string_view tv = new_text;
+
+    if (!tv.empty()) {
+
+      Size_type pos = 0;
+      while (pos < tv.size()) {
+
+        // skip spacing between lines and extra semicolons (isspace requires an unsigned char value)
+        if (std::isspace(static_cast<unsigned char>(tv[pos])) ||
+            tv[pos] == ';') {
+          ++pos;
+          continue;
+        }
+        // Emit one statement: everything up to and including the next ';', or the rest of the text.
+        Size_type end = tv.find(';', pos);
+        if (end < tv.size()) {
+          end += 1;
+        } else {
+          end = tv.size();
+        }
+
+        str << spacing << tv.substr(pos, end-pos) << "\n";
+
+        pos = end;
+      }
+    }
+  }
+  // Writes tv on its own line, indented by get_spacing(depth).
+  void print(std::ostream& str, Size_type depth, std::string_view tv) const
+  {
+    std::string spacing = get_spacing(depth);
+
+    str << spacing << tv << "\n";
+  }
+};
+
+struct CountingData;
+// Guard object: holds a Context/CountingData pair and calls pop_context() from its destructor.
+struct ScopedContext
+{
+  Context* context;
+  CountingData* countingData;
+
+  ScopedContext(CountingData* countingData_, Context* context_)
+    : context(context_)
+    , countingData(countingData_)
+  {
+  }
+  // Not default-constructible, copyable, or movable.
+  ScopedContext() = delete;
+  ScopedContext(ScopedContext const&) = delete;
+  ScopedContext(ScopedContext &&) = delete;
+  ScopedContext& operator=(ScopedContext const&) = delete;
+  ScopedContext& operator=(ScopedContext &&) = delete;
+
+  ~ScopedContext()
+  {
+    pop_context();
+  }
+  // Resets both pointers to nullptr.
+  void release()
+  {
+    countingData = nullptr;
+    context = nullptr;
+  }
+
+  inline void pop_context(); // declared only; the definition is not in this part of the file
+};
+
+struct CountingData
+{
+  static inline Context* current_context = nullptr;   // static: one value shared by every CountingData
+  static inline CountingData* current_data = nullptr; // static: one value shared by every CountingData
+
+  Size_type par_it_per_rep_counter = 0;
+  Size_type all_it_per_rep_counter = 0;
+
+  Size_type max_par_loop_depth = 0;
+  Size_type max_all_loop_depth = 0;
+
+  Size_type kernel_per_rep_counter = 0;
+  Size_type par_sync_per_rep_counter = 0;
+  Size_type team_sync_per_rep_counter = 0;
+
+  // Per-AllocationGroup tallies, indexed by the enum's integer value.
+  Size_type memory_allocations[Size_type(AllocationGroup::NumAllocationGroups)] = {0};
+  Size_type memory_bytes[Size_type(AllocationGroup::NumAllocationGroups)] = {0};
+
+  MemoryCounts memory_total_bytes[Size_type(AllocationGroup::NumAllocationGroups)];
+  MemoryCounts
memory_totals_bytes[Size_type(CountingPoint::NumCountingPoints)][Size_type(AllocationGroup::NumAllocationGroups)]; + + std::vector> allocations; + + + Size_type operation_counters[Size_type(OpType::NumOpTypes)][Size_type(Operation::NumOperations)] = {{0}}; + + + std::unique_ptr counter_context; + + + std::vector wrapper_formats; + + + void set_formats(std::initializer_list wrapper_formats) + { + for (const char* wrapper_format : wrapper_formats) { + this->wrapper_formats.emplace_back(wrapper_format); + } + } + + + AllocationMetadata* get_allocation(const void* ptr) + { + if (!ptr) { + return nullptr; + } + auto iter = std::ranges::find_if(allocations, + [&](std::unique_ptr const& item) { + if (!item->ptr) { return false; } + const char* allocation_begin = static_cast(item->ptr); + const char* allocation_end = allocation_begin + item->size*item->element_size; + return (allocation_begin <= static_cast(ptr) && + allocation_end > static_cast(ptr)); + }); + if (iter == allocations.end()) { + return nullptr; + } + return iter->get(); + } + /// + AllocationMetadata* get_allocation(const void* ptr_ptr, std::source_location location) + { + auto iter = std::ranges::find(allocations, + std::make_tuple(ptr_ptr, location.line(), location.column()), + [](std::unique_ptr const& item) { + return std::make_tuple(item->ptr_ptr, + item->allocate_location.line(), + item->allocate_location.column()); + }); + if (iter == allocations.end()) { + return nullptr; + } + return iter->get(); + } + + void add_allocation_impl(std::string pointed_to_type_name, AllocationGroup group, void* ptr, + Size_type size, Size_type element_size, + const void* ptr_ptr, std::source_location location) + { + auto item = get_allocation(ptr); + if (item) { + throw std::runtime_error("Allocation with this pointer already registered"); + } + item = get_allocation(ptr_ptr, location); + if (item) { + if (pointed_to_type_name != item->pointed_to_type_name || + size != item->size || + element_size != 
item->element_size) { + throw std::runtime_error("Allocation at this location changed type, size, or element_size"); + } + item->allocate(ptr); + } else { + item = allocations.emplace_back( + std::make_unique( + allocations.size(), ptr_ptr, location, group, + std::move(pointed_to_type_name), ptr, size, element_size)).get(); + counter_context->update_allocations(allocations); + current_context->add_allocation(*item); + } + } + + void add_allocation(std::string pointed_to_type_name, void* ptr, + Size_type size, Size_type element_size, + const void* ptr_ptr, std::source_location location) + { + add_allocation_impl(std::move(pointed_to_type_name), AllocationGroup::global, + ptr, size, element_size, + ptr_ptr, location); + } + + void add_team_allocation(std::string pointed_to_type_name, void* ptr, + Size_type size, Size_type element_size, + const void* ptr_ptr, std::source_location location) + { + add_allocation_impl(std::move(pointed_to_type_name), AllocationGroup::team, + ptr, size, element_size, + ptr_ptr, location); + } + + void remove_allocation(void* ptr, + [[maybe_unused]] const void* ptr_ptr, + [[maybe_unused]] std::source_location location = std::source_location::current()) + { + auto item = get_allocation(ptr); + if (!item) { + throw std::runtime_error("Allocation with this pointer not registered"); + } + item->deallocate(); + current_context->remove_allocation(*item); + } + + + ScopedContext create_context(const char* text, + std::source_location location = std::source_location::current()) + { + if (counter_context) { + throw std::runtime_error("Already created exterior context"); + } + + counter_context = std::make_unique( + location.line(), nullptr, ContextType::exterior, text, allocations); + + current_data = this; + current_context = counter_context.get(); + + current_context->hit_count += 1; + + return {this, current_context}; + } + + void push_context(ContextType type, const char* text, + std::source_location location = std::source_location::current()) 
+ { + if (!current_data) { + throw std::runtime_error("Current data not set"); + } + if (!current_context) { + throw std::runtime_error("Current context not set"); + } + current_context = current_context->get_or_emplace_child( + location.line(), type, text, allocations); + current_context->hit_count += 1; + } + + ScopedContext push_outer_context(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::outer, text, location); + return {this, current_context}; + } + + ScopedContext push_rep_context(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::repetition, text, location); + return {this, current_context}; + } + + ScopedContext push_cond_context(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::cond, text, location); + return {this, current_context}; + } + + ScopedContext push_outer_loop_context(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::outer_loop, text, location); + return {this, current_context}; + } + + ScopedContext push_seq_loop_context(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::seq_loop, text, location); + return {this, current_context}; + } + + ScopedContext push_par_loop_context(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::par_loop, text, location); + return {this, current_context}; + } + + ScopedContext push_body_context(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::body, text, location); + return {this, current_context}; + } + + ScopedContext push_team_context(const char* text, + std::source_location location = std::source_location::current()) + { + 
push_context(ContextType::team, text, location); + return {this, current_context}; + } + + void add_par_sync(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::par_sync, text, location); + pop_context(); + } + + void add_team_sync(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::team_sync, text, location); + pop_context(); + } + + void pop_context() + { + if (!current_context) { + throw std::runtime_error("No context to pop"); + } + if (current_context->point_depth == 1) { + CountingPoint src_point = current_context->point; + CountingPoint dst_point = current_context->parent + ? current_context->parent->point + : src_point; + count_touches(current_context, src_point, dst_point, 0); + } + + current_context = current_context->parent; + } + + + + void finalize_context([[maybe_unused]] std::source_location location) + { + if (!counter_context) throw std::runtime_error("Exterior context not created"); + if (!current_context) throw std::runtime_error("No current context"); + if (current_context != counter_context.get()) throw std::runtime_error("Not at outer context"); + current_context = nullptr; + current_data = nullptr; + + count_totals(counter_context.get(), 0); + + // count stats for allocations + for (auto& item : allocations) { + + Size_type g = Size_type(item->group); + + memory_allocations[g] += 1; + memory_bytes[g] += item->size * item->element_size; + + memory_total_bytes[g].add(item->counts.total_counts, item->element_size); + + for (Size_type p = 0; p < Size_type(CountingPoint::NumCountingPoints); ++p) { + + memory_totals_bytes[p][g].add(item->counts.address_counts[p], item->element_size); + + item->counts.clear_accesses(CountingPoint(p)); + } + } + + count_kernels_and_iterations(counter_context.get()); + + count_operations(counter_context.get()); + } + + void count_totals(Context* context, Size_type depth) + { + for (auto& 
child_ptr : context->children) { + count_totals(child_ptr.get(), depth+1); + } + + for (auto& item : allocations) { + context->count_totals(*item); + } + context->clear(); + } + + void count_touches(Context* context, CountingPoint src_point, + CountingPoint dst_point, Size_type depth) + { + for (auto& child_ptr : context->children) { + count_touches(child_ptr.get(), src_point, dst_point, depth+1); + } + + for (auto& item : allocations) { + + auto& src_counts = context->aloc_counts[item->idx]; + + item->counts.combine_accesses(src_point, src_counts, src_point); + + if (dst_point < CountingPoint::NumCountingPoints && + dst_point != src_point) { + + context->aloc_counts[item->idx].combine_accesses( + dst_point, src_counts, src_point); + + item->counts.combine_accesses(dst_point, src_counts, src_point); + + } + + src_counts.count(src_point); + + src_counts.set_all_accesses(src_point, false); + + if (depth == 0) { + + item->counts.count(src_point); + + item->counts.set_all_accesses(src_point, false); + } + } + + } + + std::array count_kernels_and_iterations( + Context* context, + Size_type par_loop_stack_depth = 0, + Size_type all_loop_stack_depth = 0) + { + if (!context->parent) { + par_it_per_rep_counter = 0; + all_it_per_rep_counter = 0; + max_par_loop_depth = 0; + max_all_loop_depth = 0; + kernel_per_rep_counter = 0; + par_sync_per_rep_counter = 0; + team_sync_per_rep_counter = 0; + } + + if (context->type == ContextType::par_loop) { + par_loop_stack_depth += 1; + all_loop_stack_depth += 1; + max_par_loop_depth = std::max(par_loop_stack_depth, max_par_loop_depth); + max_all_loop_depth = std::max(all_loop_stack_depth, max_all_loop_depth); + } else if (context->type == ContextType::seq_loop) { + all_loop_stack_depth += 1; + max_all_loop_depth = std::max(all_loop_stack_depth, max_all_loop_depth); + } + + Size_type max_child_par_iterations = 0; + Size_type all_child_par_iterations = 0; + Size_type max_child_iterations = 0; + Size_type all_loop_iterations = 0; + + for 
(auto& child_ptr : context->children) { + + auto [par_iter, all_iter] = + count_kernels_and_iterations(child_ptr.get(), + par_loop_stack_depth, + all_loop_stack_depth); + + max_child_par_iterations = std::max(par_iter, max_child_par_iterations); + all_child_par_iterations += par_iter; + max_child_iterations = std::max(child_ptr->hit_count, max_child_iterations); + all_loop_iterations += all_iter; + + } + + Size_type child_par_iterations = all_child_par_iterations; + Size_type child_all_iterations = all_loop_iterations; + if (context->type == ContextType::seq_loop) { + child_all_iterations = std::max(all_loop_iterations, max_child_iterations); + } + + if (context->type == ContextType::team_sync) { + team_sync_per_rep_counter += context->hit_count; + } else if (context->type == ContextType::par_sync) { + par_sync_per_rep_counter += context->hit_count; + } + + if (Size_type(context->point) <= Size_type(CountingPoint::loop)) { + + if (context->point == CountingPoint::loop && context->point_depth == 1) { + kernel_per_rep_counter += context->hit_count; + } + + child_par_iterations = max_child_par_iterations; + if (context->type == ContextType::par_loop) { + child_par_iterations = std::max(max_child_par_iterations, max_child_iterations); + child_all_iterations = std::max(all_loop_iterations, max_child_iterations); + } + + } + + if (context->point == CountingPoint::rep && context->point_depth == 1) { + par_it_per_rep_counter = all_child_par_iterations; + all_it_per_rep_counter = all_loop_iterations; + } + + return {{child_par_iterations, child_all_iterations}}; + + } + + void count_operations(Context* context) + { + for (auto& child_ptr : context->children) { + count_operations(child_ptr.get()); + } + + if (Size_type(context->point) > Size_type(CountingPoint::rep)) { + return; // don't count operations outside of the repetition + } + + for (Size_type ot = 0; ot < Size_type(OpType::NumOpTypes); ++ot) { + for (Size_type op = 0; op < Size_type(Operation::NumOperations); ++op) 
{ + operation_counters[ot][op] += context->operation_counters[ot][op]; + } + } + } + + void print_context(std::ostream& str, Context const& context, Size_type depth) const + { + context.print_header(str, depth+1); + + context.print_allocations(str, depth+1, allocations); + + context.print_counters(str, depth+1); + + context.print_text(str, depth+1, wrapper_formats); + + if (!context.children.empty()) { + + context.print(str, depth+1, "{"); + + for (auto const& child_ptr : context.children) { + print_context(str, *child_ptr.get(), depth+1); + } + + context.print(str, depth+1, "}"); + } + } + + void print(std::ostream& str) const + { + Context const& context = *counter_context.get(); + Size_type depth = 0; + context.print(str, depth, "{"); + print_context(str, context, depth); + context.print(str, depth, "}"); + } +}; + +inline void ScopedContext::pop_context() +{ + if (context) { + if (CountingData::current_context != context) { + throw std::runtime_error("ScopedContext popped in wrong context"); + } + if (CountingData::current_data != countingData) { + throw std::runtime_error("ScopedContext popped in wrong context"); + } + CountingData::current_data->pop_context(); + release(); + } +} + +} // closing brace for counting namespace + +} // closing brace for rajaperf namespace + +#endif // closing endif for header file include guard diff --git a/src/common/CountingMacros.hpp b/src/common/CountingMacros.hpp new file mode 100644 index 000000000..37ab06ff6 --- /dev/null +++ b/src/common/CountingMacros.hpp @@ -0,0 +1,143 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJAPerf_CountingMacros_HPP +#define RAJAPerf_CountingMacros_HPP + +// Note that using this should change the signature of functions but +// can cause ODR violations if it does not + +// Use this wrapper type in variable declarations in a kernel +// ex. +// RAJAPERF_WRAPPER(Real_type) val = ptr[i]; +// Note do not use it if declaring variables with constant values +#ifdef RAJAPERF_WRAPPER +#undef RAJAPERF_WRAPPER +#endif +#define RAJAPERF_WRAPPER(type) counting::Wrapper +#define RAJAPERF_ARRAY1_WRAPPER(type_name) typename counting::Array1WrapperHelper::template type +#define RAJAPERF_ARRAY2_WRAPPER(type_name) typename counting::Array2WrapperHelper::template type +#define RAJAPERF_ARRAY3_WRAPPER(type_name) typename counting::Array3WrapperHelper::template type +#define RAJAPERF_ARRAY4_WRAPPER(type_name) typename counting::Array4WrapperHelper::template type + +#define RAJAPERF_ATOMIC_ADD_COUNTING(lhs, rhs) \ + (lhs).atomic_add(rhs); + + +#define RAJAPERF_COUNTERS_INITIALIZE() \ + auto _exterior_context = this->initializeCounters({ \ + RAJAPERF_STRINGIFY(RAJAPERF_WRAPPER({0})), \ + RAJAPERF_STRINGIFY(RAJAPERF_ARRAY1_WRAPPER({0})), \ + RAJAPERF_STRINGIFY(RAJAPERF_ARRAY2_WRAPPER({0})), \ + RAJAPERF_STRINGIFY(RAJAPERF_ARRAY3_WRAPPER({0}))}); + +#define RAJAPERF_COUNTERS_CODE_WRAPPER(...) \ + auto RAJAPERF_NAME_PER_LINE(_code_context_) = \ + counting::CountingData::current_data-> \ + push_outer_context(RAJAPERF_STRINGIFY(__VA_ARGS__)); \ + __VA_ARGS__; \ + RAJAPERF_NAME_PER_LINE(_code_context_).pop_context() + +#define RAJAPERF_COUNTERS_REP_SCOPE() \ + if constexpr (auto _rep_context = \ + counting::CountingData::current_data->push_rep_context( \ + "for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1)"); \ + false) {} else + +#define RAJAPERF_COUNTERS_IF(...) 
\ + if constexpr (auto RAJAPERF_NAME_PER_LINE(_cond_context_) = \ + counting::CountingData::current_data->push_cond_context( \ + RAJAPERF_STRINGIFY(__VA_ARGS__)); false) {} else \ + __VA_ARGS__ + +#define RAJAPERF_COUNTERS_ELSE_IF(...) \ + else if constexpr (auto RAJAPERF_NAME_PER_LINE(_cond_context_) = \ + counting::CountingData::current_data->push_cond_context( \ + "else " RAJAPERF_STRINGIFY(__VA_ARGS__)); false) {} else \ + __VA_ARGS__ + +#define RAJAPERF_COUNTERS_ELSE() \ + else if constexpr (auto RAJAPERF_NAME_PER_LINE(_cond_context_) = \ + counting::CountingData::current_data->push_cond_context( \ + "else"); false) {} else + +// Note the main practical difference between this and SEQ_LOOP +// is that only SEQ_LOOP counts iterations +#define RAJAPERF_COUNTERS_OUTER_LOOP(...) \ + if constexpr (auto RAJAPERF_NAME_PER_LINE(_loop_context_) = \ + counting::CountingData::current_data->push_outer_loop_context( \ + RAJAPERF_STRINGIFY(__VA_ARGS__)); false) {} else \ + __VA_ARGS__ + +#define RAJAPERF_COUNTERS_SEQ_LOOP(...) \ + if constexpr (auto RAJAPERF_NAME_PER_LINE(_loop_context_) = \ + counting::CountingData::current_data->push_seq_loop_context( \ + RAJAPERF_STRINGIFY(__VA_ARGS__)); false) {} else \ + __VA_ARGS__ + +#define RAJAPERF_COUNTERS_PAR_LOOP(...) \ + if constexpr (auto RAJAPERF_NAME_PER_LINE(_loop_context_) = \ + counting::CountingData::current_data->push_par_loop_context( \ + RAJAPERF_STRINGIFY(__VA_ARGS__)); false) {} else \ + __VA_ARGS__ + +#define RAJAPERF_COUNTERS_LOOP_BODY(...) \ + auto RAJAPERF_NAME_PER_LINE(_body_context_) = \ + counting::CountingData::current_data->push_body_context( \ + RAJAPERF_STRINGIFY(__VA_ARGS__)); \ + __VA_ARGS__; \ + RAJAPERF_NAME_PER_LINE(_body_context_).pop_context() + +#define RAJAPERF_COUNTERS_TEAM_CONTEXT() \ + auto RAJAPERF_NAME_PER_LINE(_team_context_) = \ + counting::CountingData::current_data->push_team_context(""); + +#define RAJAPERF_COUNTERS_PAR_ALG(...) 
\ + auto RAJAPERF_NAME_PER_LINE(_alg_context_) = \ + counting::CountingData::current_data->push_par_loop_context( \ + RAJAPERF_STRINGIFY(__VA_ARGS__)); \ + __VA_ARGS__; \ + RAJAPERF_NAME_PER_LINE(_alg_context_).pop_context() + + +#define RAJAPERF_COUNTERS_PAR_SYNC() \ + counting::CountingData::current_data->add_par_sync("synchronize();") + +#define RAJAPERF_COUNTERS_TEAM_SYNC() \ + counting::CountingData::current_data->add_team_sync("synchronize();") + +#define RAJAPERF_COUNTERS_FINALIZE() \ + this->finalizeCounters(_exterior_context) + + +// Wrap rajaperf data types after implementing everything +#define Index_type RAJAPERF_WRAPPER(Index_type) +#define Index_ptr RAJAPERF_WRAPPER(Index_ptr) +#define Index_ptr_ptr RAJAPERF_WRAPPER(Index_ptr_ptr) +#define Size_type RAJAPERF_WRAPPER(Size_type) +#define Int_type RAJAPERF_WRAPPER(Int_type) +#define Int_ptr RAJAPERF_WRAPPER(Int_ptr) +#define Int_ptr_ptr RAJAPERF_WRAPPER(Int_ptr_ptr) +#define Real_type RAJAPERF_WRAPPER(Real_type) +#define Real_array RAJAPERF_ARRAY1_WRAPPER(Real_array) +#define Real_array2 RAJAPERF_ARRAY2_WRAPPER(Real_array2) +#define Real_array3 RAJAPERF_ARRAY3_WRAPPER(Real_array3) +#define Real_array4 RAJAPERF_ARRAY4_WRAPPER(Real_array4) +#define Real_array_ref RAJAPERF_ARRAY1_WRAPPER(Real_array_ref) +#define Real_array_const_ref RAJAPERF_ARRAY1_WRAPPER(Real_array_const_ref) +#define Real_array2_ref RAJAPERF_ARRAY2_WRAPPER(Real_array2_ref) +#define Real_array3_ref RAJAPERF_ARRAY3_WRAPPER(Real_array3_ref) +#define Real_array4_ref RAJAPERF_ARRAY4_WRAPPER(Real_array4_ref) +#define Real_ptr RAJAPERF_WRAPPER(Real_ptr) +#define Real_ptr_ptr RAJAPERF_WRAPPER(Real_ptr_ptr) +#define Complex_type RAJAPERF_WRAPPER(Complex_type) +#define Complex_ptr RAJAPERF_WRAPPER(Complex_ptr) +#define Data_type RAJAPERF_WRAPPER(Data_type) +#define Data_ptr RAJAPERF_WRAPPER(Data_ptr) + +#endif // closing endif for header file include guard diff --git a/src/common/CountingWrapper.hpp b/src/common/CountingWrapper.hpp new file mode 
100644 index 000000000..451676369 --- /dev/null +++ b/src/common/CountingWrapper.hpp @@ -0,0 +1,1035 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJAPerf_CountingWrapper_HPP +#define RAJAPerf_CountingWrapper_HPP + +#include "common/RAJAPerfSuite.hpp" +#include "common/RPTypes.hpp" +#include "common/CountingData.hpp" + +#include +#include +#include +#include +#include + +namespace rajaperf +{ + +namespace counting +{ + +// Wrapper types that count operations +template < typename T > +struct Wrapper; + + +template < typename T > +struct is_wrapper +{ + static inline constexpr bool value = false; +}; + +template < typename T > +struct is_wrapper> +{ + static inline constexpr bool value = true; +}; + +template < typename T > +inline constexpr bool is_wrapper_v = is_wrapper::value; + + + +template < typename T > +concept Wrapped = is_wrapper_v>; + +template < typename T > +concept NonWrapped = !Wrapped; + +template < typename T > +concept WrappedVal = Wrapped && T::is_val; + +template < typename T > +concept WrappedArray = Wrapped && T::is_array; + +template < typename T > +concept WrappedPtr = Wrapped && T::is_ptr; + +template < typename T > +concept WrappedNonPtr = Wrapped && !T::is_ptr; + +template < typename T > +concept WrappedRef = Wrapped && T::is_ref; + + +template < typename T > +struct PointedToType +{ + using type = std::remove_reference_t())>; +}; + +template < WrappedPtr T > +struct PointedToType +{ + using type = typename std::remove_cvref_t::pointed_to_type; +}; + +template < typename T > +using pointed_to_type_t = typename PointedToType::type; + + +template < typename T > +struct WrappedType +{ + using type = T; 
+}; + +template < Wrapped T > +struct WrappedType +{ + using direct_type = typename std::remove_cvref_t::wrapped_type; + using const_type = std::conditional_t, std::add_const_t, direct_type>; + using lref_type = std::conditional_t, std::add_lvalue_reference_t, const_type>; + using rref_type = std::conditional_t, std::add_rvalue_reference_t, lref_type>; + using type = rref_type; +}; + +template < typename T > +using wrapped_type_t = typename WrappedType::type; + + +template < typename T > +concept raw_pointer = std::is_pointer_v; + +template < typename T > +concept pointer = raw_pointer || WrappedPtr; + +template < typename T > +concept convertible_to_pointer = std::convertible_to, pointed_to_type_t*>; + +template < typename T, typename U > +concept convertible_to = std::convertible_to, wrapped_type_t>; + +template < typename T > +concept integral = std::integral || + (Wrapped && std::integral); + + +template < typename T > +constexpr decltype(auto) get_value(T&& val, Size_type num_ops=0) +{ + if constexpr (Wrapped) { + return std::forward(val).get_native(num_ops); + } else { + return std::forward(val); + } +} + +template +struct add_all_extents_of_to +{ + using type = V; +}; + +template +struct add_all_extents_of_to +{ + using type = typename add_all_extents_of_to::type[]; +}; + +template +struct add_all_extents_of_to +{ + using type = typename add_all_extents_of_to::type[N]; +}; + +template +using add_all_extents_of_to_t = typename add_all_extents_of_to::type; + +template < typename T > +struct Wrapper +{ + static inline constexpr bool is_ref = std::is_reference_v; + static inline constexpr bool is_val = !is_ref; + static inline constexpr bool is_array = std::is_array_v>; + static inline constexpr bool is_ptr = std::is_pointer_v; + + template < typename U > + friend struct Wrapper; + + using wrapped_type = T; + + using value_type = std::conditional_t, T>; + using const_value_type = std::conditional_t>, + const value_type>; + + using member_type = 
std::conditional_t>, value_type>; + + using pointed_to_type = + std::conditional_t, + std::conditional_t, + value_type>>; + using const_pointed_to_type = + std::conditional_t, + std::conditional_t, + const_value_type>>; + + template < size_t... Is > + static constexpr size_t get_array_size(std::index_sequence) + { + return (... * std::extent_v); + } + /// + static constexpr size_t get_array_size() + { + if constexpr (is_array) { + using dims = std::make_index_sequence>; + return get_array_size(dims{}); + } + return size_t(0); + } + + explicit Wrapper(AllocationMetadata* allocation, member_type value) + : m_value(value) + , m_allocation(allocation) + { + } + + // allow default construction of non-ref values + Wrapper() + requires(is_val && !is_array) + : m_value() + { + } + /// + Wrapper(std::source_location location = std::source_location::current()) + requires(is_val && is_array) + : m_value() + { + registerArray(location); + m_allocation = CountingData::current_data->get_allocation( + static_cast(&m_value)); + } + + // allow implicit construction from non-wrapped values + template < convertible_to rhs_T > + Wrapper(rhs_T&& rhs) + requires(is_val && !is_array && !is_ptr) + : m_value(get_value(std::forward(rhs), 1)) + { + this->count(Operation::copy, 1); + } + /// + Wrapper(std::nullptr_t) + requires(is_val && !is_array && is_ptr) + : Wrapper() + { + } + /// + template < convertible_to rhs_T > + Wrapper(rhs_T&& rhs) + requires(is_val && !is_array && is_ptr) + : m_value(get_value(std::forward(rhs), 1)) + { + if constexpr (WrappedPtr) { + m_allocation = rhs.m_allocation; + } else { + m_allocation = CountingData::current_data->get_allocation( + static_cast(m_value)); + } + if (!m_allocation) { + std::ostringstream str; + str << "Couldn't find allocation "; + str << static_cast(get_value(std::forward(rhs))); + throw std::runtime_error(str.str()); + } + this->count(Operation::copy, 1); + } + /// + template < NonWrapped rhs_T > + Wrapper(rhs_T& rhs) + requires(is_ref) + 
: m_value(&rhs) + { + m_allocation = CountingData::current_data->get_allocation( + static_cast(m_value)); + if (!m_allocation) { + throw std::runtime_error("Couldn't find allocation"); + } + } + + // copy and move constructors + Wrapper(Wrapper const& rhs) + requires(is_val && !is_array) + : m_value(rhs.get_native()) + , m_allocation(rhs.m_allocation) + { + this->count(Operation::copy, 1); + } + /// + Wrapper(Wrapper && rhs) + requires(is_val && !is_array) + : m_value(std::move(rhs).get_native()) + , m_allocation(rhs.m_allocation) + { + this->count(Operation::copy, 1); + } + /// + Wrapper(Wrapper const& rhs) + requires(is_ref) + : m_value(rhs.m_value) + , m_allocation(rhs.m_allocation) + { + } + + // count assignments from non-wrapped values + template < NonWrapped rhs_T > + Wrapper& operator=(rhs_T&& rhs) + requires(!is_array) + { + this->set(std::forward(rhs)); + if constexpr (is_ptr) { + this->m_allocation = CountingData::current_data->get_allocation( + (void*)(m_value)); + if (!m_allocation) { + throw std::runtime_error("Couldn't find allocation"); + } + } + this->count(Operation::assign, 1); + return *this; + } + /// + Wrapper& operator=(std::nullptr_t) + requires(is_val && is_ptr) + { + return (*this) = Wrapper(); + } + + // count assignments from wrappers + Wrapper& operator=(Wrapper const& rhs) + requires(!is_array) + { + this->set(rhs.get_native()); + if constexpr (is_ptr) { + this->m_allocation = rhs.m_allocation; + } + this->count(Operation::assign, 1); + return *this; + } + /// + Wrapper& operator=(Wrapper&& rhs) + requires(!is_array) + { + this->set(std::move(rhs).get_native()); + if constexpr (is_ptr) { + this->m_allocation = rhs.m_allocation; + } + this->count(Operation::assign, 1); + return *this; + } + /// + template < Wrapped rhs_T > + Wrapper& operator=(rhs_T&& rhs) + requires(!is_array) + { + this->set(std::forward(rhs).get_native()); + if constexpr (is_ptr) { + this->m_allocation = rhs.m_allocation; + } + this->count(Operation::assign, 1); + 
return *this; + } + + ~Wrapper() + { + if constexpr (is_val && is_array) { + deregisterArray(); + } + } + + +#define RAJAPERF_DEFINE_WRAPPER_PRE_OPERATOR(op_name, op, op_enum) \ + auto& op_name() \ + requires(!is_array) \ + { \ + this->set(this->get_native() op 1); \ + this->count(op_enum, 1); \ + return *this; \ + } + + RAJAPERF_DEFINE_WRAPPER_PRE_OPERATOR(operator++, +, Operation::preinc) + RAJAPERF_DEFINE_WRAPPER_PRE_OPERATOR(operator--, -, Operation::predec) + + +#define RAJAPERF_DEFINE_WRAPPER_POST_OPERATOR(op_name, op, op_enum) \ + auto op_name(int) \ + requires(!is_array) \ + { \ + auto value = this->get_value_wrapper(); \ + this->set(value.get_native() op 1); \ + this->count(op_enum, 1); \ + return value; \ + } + + RAJAPERF_DEFINE_WRAPPER_POST_OPERATOR(operator++, +, Operation::postinc) + RAJAPERF_DEFINE_WRAPPER_POST_OPERATOR(operator--, -, Operation::postdec) + + +#define RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(op_name, op, op_enum) \ + template < WrappedNonPtr rhs_T > \ + auto& op_name(rhs_T const& rhs) \ + requires(!is_array && !is_ptr) \ + { \ + this->set(this->get_native() op rhs.get_native()); \ + this->count(op_enum, 1); \ + return *this; \ + } \ + template < NonWrapped rhs_T > \ + auto& op_name(rhs_T const& rhs) \ + requires(!is_array && !is_ptr) \ + { \ + this->set(this->get_native() op rhs); \ + this->count(op_enum, 1); \ + return *this; \ + } + + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator+=, +, Operation::add) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator-=, -, Operation::sub) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator*=, *, Operation::mult) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator/=, /, Operation::div) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator%=, %, Operation::rem) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator&=, &, Operation::bit_and) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator|=, |, Operation::bit_or) + 
RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator^=, ^, Operation::bit_xor) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator<<=, <<, Operation::bit_lsh) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator>>=, >>, Operation::bit_rsh) + + +#define RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_POINTER_OPERATOR(op_name, op, op_enum) \ + template < WrappedNonPtr rhs_T > \ + auto& op_name(rhs_T const& rhs) \ + requires(is_ptr) \ + { \ + this->m_value op##= rhs.get_native(); \ + this->count(op_enum, 1); \ + return *this; \ + } \ + template < NonWrapped rhs_T > \ + auto& op_name(rhs_T const& rhs) \ + requires(is_ptr) \ + { \ + this->m_value op##= rhs; \ + this->count(op_enum, 1); \ + return *this; \ + } + + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_POINTER_OPERATOR(operator+=, +, Operation::add) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_POINTER_OPERATOR(operator-=, -, Operation::sub) + + +#define RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_ATOMIC_OPERATOR(op_name, op, op_enum) \ + template < WrappedNonPtr rhs_T > \ + auto& op_name(rhs_T const& rhs) \ + requires(is_ref) \ + { \ + this->set(this->get_native(0) op rhs.get_native(), 0); \ + this->count(op_enum, 1); \ + return *this; \ + } \ + template < NonWrapped rhs_T > \ + auto& op_name(rhs_T const& rhs) \ + requires(is_ref) \ + { \ + this->set(this->get_native(0) op rhs, 0); \ + this->count(op_enum, 1); \ + return *this; \ + } + + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_ATOMIC_OPERATOR(atomic_add, +, Operation::atomic_add) + + + auto operator&() + requires(is_ref) + { + return Wrapper(m_allocation, m_value); + } + /// + auto operator&() const + requires(is_ref) + { + return Wrapper(m_allocation, m_value); + } + + + auto operator*() + requires(is_array || is_ptr) + { + if constexpr (is_val) { + if constexpr (is_array) { + return Wrapper(m_allocation, &*m_value); + } else { + return Wrapper(m_allocation, m_value); + } + } else { + if constexpr (is_array) { + return Wrapper(m_allocation, &**m_value); + } else { 
+ return Wrapper(nullptr, *m_value); + } + } + } + /// + auto operator*() const + requires(is_array || is_ptr) + { + if constexpr (is_val) { + if constexpr (is_array) { + return Wrapper(m_allocation, &*m_value); + } else { + return Wrapper(m_allocation, m_value); + } + } else { + if constexpr (is_array) { + return Wrapper(m_allocation, &*(*m_value)); + } else { + return Wrapper(nullptr, (*m_value)); + } + } + } + + auto operator->() const + requires(is_ptr) + { + return m_value; + } + + + template < convertible_to I > + auto operator[](I&& i) + requires(is_array || is_ptr) + { + this->count(Operation::add, 1); + if constexpr (is_val) { + if constexpr (is_array) { + return Wrapper(m_allocation, &m_value[get_value(std::forward(i), 1)]); + } else { + return Wrapper(m_allocation, m_value+get_value(std::forward(i), 1)); + } + } else { + if constexpr (is_array) { + return Wrapper(m_allocation, &(*m_value)[get_value(std::forward(i), 1)]); + } else { + return Wrapper(nullptr, (*m_value)+get_value(std::forward(i), 1)); + } + } + } + /// + template < convertible_to I > + auto operator[](I&& i) const + requires(is_array || is_ptr) + { + this->count(Operation::add, 1); + if constexpr (is_val) { + if constexpr (is_array) { + return Wrapper(m_allocation, &m_value[get_value(std::forward(i), 1)]); + } else { + return Wrapper(m_allocation, m_value+get_value(std::forward(i), 1)); + } + } else { + if constexpr (is_array) { + return Wrapper(m_allocation, &(*m_value)[get_value(std::forward(i), 1)]); + } else { + return Wrapper(nullptr, (*m_value)+get_value(std::forward(i), 1)); + } + } + } + + operator auto() const + requires(!is_array) + { + this->count(Operation::copy, 1); + return this->get_native(); + } + /// explicit to avoid issues in ternary operators where value wrappers -> reference wrappers and vice-versa + explicit operator Wrapper() + requires(is_val && !(is_array || is_ptr)) + { + return Wrapper(nullptr, &m_value); + } + /// explicit to avoid issues in ternary operators 
where value wrappers -> reference wrappers and vice-versa + explicit operator Wrapper() const + requires(is_val && !(is_array || is_ptr)) + { + return Wrapper(nullptr, &m_value); + } + /// + operator Wrapper() + requires(is_val && (is_array || is_ptr)) + { + return Wrapper(m_allocation, &m_value); + } + /// + operator Wrapper() const + requires(is_val && (is_array || is_ptr)) + { + return Wrapper(m_allocation, &m_value); + } + /// + operator Wrapper() const + requires(is_ref && !(is_array || is_ptr)) + { + return Wrapper(nullptr, m_value); + } + /// + operator Wrapper() const + requires(is_ref && (is_array || is_ptr)) + { + return Wrapper(m_allocation, m_value); + } + + void swap(Wrapper& rhs) + requires(!is_array) // consider implementing array version later + { + using std::swap; + value_type rhs_tmp(std::move(rhs).get_native()); + rhs.set(std::move(*this).get_native()); + this->set(std::move(rhs_tmp)); + swap(this->m_allocation, rhs.m_allocation); + } + + void swap(Wrapper&& rhs) && + requires(is_ref && ! 
is_array) + { + using std::swap; + value_type rhs_tmp(std::move(rhs).get_native()); + rhs.set(std::move(*this).get_native()); + this->set(std::move(rhs_tmp)); + swap(this->m_allocation, rhs.m_allocation); + } + + + // internal interface methods, should only be used in this file + template < typename rhs_T > + void set(rhs_T&& rhs, Size_type num_ops = 1) + requires(!is_array) + { + if constexpr (is_val) { + m_value = std::forward(rhs); + } else { + this->count(Operation::store, num_ops); + *m_value = std::forward(rhs); + } + } + + // gets a copy of the value represented by this object + auto get_value_wrapper(Size_type num_ops = 1) const + requires(!is_array) + { + if constexpr (is_val) { + return Wrapper(m_allocation, m_value); + } else { + this->count(Operation::load, num_ops); + return Wrapper(nullptr, *m_value); + } + } + + // gets a reference to the underlying value + auto&& get_native(Size_type num_ops = 1) & + { + if constexpr (is_val) { + return m_value; + } else { + this->count(Operation::load, num_ops); + return *m_value; + } + } + /// + auto&& get_native(Size_type num_ops = 1) && + { + if constexpr (is_val) { + return std::move(m_value); + } else { + this->count(Operation::load, num_ops); + return *m_value; + } + } + /// + auto&& get_native(Size_type num_ops = 1) const& + { + if constexpr (is_val) { + return m_value; + } else { + this->count(Operation::load, num_ops); + return *m_value; + } + } + /// + auto&& get_native(Size_type num_ops = 1) const&& + { + if constexpr (is_val) { + return std::move(m_value); + } else { + this->count(Operation::load, num_ops); + return *m_value; + } + } + + + template < typename U = T > + void count(Operation op, Size_type num_ops) const + { + using V = std::decay_t; // decay arrays to pointers + + if (!CountingData::current_context) { + throw std::runtime_error("Can't count if there is no current context"); + } + + CountingData::current_context->operation_counters[ + Size_type(getOpType())][Size_type(op)] += num_ops; + + 
if constexpr (std::is_pointer_v && sizeof(std::remove_pointer_t) > 1) { + + if (op == Operation::add || op == Operation::sub) { + // Note that this fails to differentiate between + // adding/subtracting a pointer and an integer which entails a mult or bit_lsh + // and subtracting two pointers which entails a div or bit_rsh + auto is_pow_2 = [](size_t n) { return (n & (n-1)) == size_t(0); }; + Operation extra_op = is_pow_2(sizeof(std::remove_pointer_t)) + ? Operation::bit_lsh : Operation::mult ; + CountingData::current_context->operation_counters[ + Size_type(getOpType())][Size_type(extra_op)] += num_ops; + } + } + + if constexpr (std::is_reference_v) { + if (op == Operation::load || op == Operation::store || + op == Operation::atomic_add) { + + if (!m_allocation) { + throw std::runtime_error("Memory access to unknown allocation"); + } + + auto base_ptr = static_cast(m_allocation->ptr); + check_bounds(base_ptr); + + if (num_ops > Size_type(0)) { + CountingPoint point = CountingData::current_context->point; + MemoryAccess access = MemoryAccess::NumMemoryAccesses; + if (op == Operation::load) { + access = MemoryAccess::read; + } else if (op == Operation::store) { + access = MemoryAccess::write; + } else if (op == Operation::atomic_add) { + access = MemoryAccess::atomicModifyWrite; + } + Size_type offset = m_value - base_ptr; + CountingData::current_context->aloc_counts[m_allocation->idx]. 
+ touch(point, access, offset, num_ops); + } + } + } + } + + void check_bounds(member_type base_ptr) const + requires(is_ref) + { + if (!base_ptr) { + throw std::runtime_error("Memory access to deallocated pointer"); + } + if (m_value < base_ptr) { + throw std::runtime_error("Memory access is out of bounds low"); + } + if (m_value >= (base_ptr + m_allocation->size)) { + throw std::runtime_error("Memory access is out of bounds high"); + } + } + + void registerArray(std::source_location location = std::source_location::current()) + requires(is_val && is_array) + { + CountingData::current_data->add_team_allocation( + get_type_name>(), + static_cast(&m_value), + get_array_size(), sizeof(std::remove_all_extents_t), + static_cast(&m_value), location); + } + + void deregisterArray(std::source_location location = std::source_location::current()) + requires(is_val && is_array) + { + CountingData::current_data->remove_allocation( + static_cast(&m_value), + static_cast(&m_value), location); + } + +private: + member_type m_value; + AllocationMetadata* m_allocation = nullptr; +}; + +template < typename U > +auto make_ValueWrapper(U&& value) +{ + return Wrapper>(value); +} + +// Operations with Wrapper types +// Some of these will be found before functions of the same name in the +// global namespace + +#define RAJAPERF_DEFINE_WRAPPER_UNARY_FUNCTION(op_name, op, op_enum) \ + template < typename T > \ + auto op_name(Wrapper const& obj) \ + { \ + using ::op; \ + auto value = make_ValueWrapper(op(obj.get_native())); \ + value.count(op_enum, 1); \ + return value; \ + } + +RAJAPERF_DEFINE_WRAPPER_UNARY_FUNCTION(exp, exp, Operation::exp) +RAJAPERF_DEFINE_WRAPPER_UNARY_FUNCTION(sqrt, sqrt, Operation::sqrt) +RAJAPERF_DEFINE_WRAPPER_UNARY_FUNCTION(fabs, fabs, Operation::abs) + + +#define RAJAPERF_DEFINE_WRAPPER_UNARY_OPERATOR(op_name, op, op_enum) \ + template < WrappedNonPtr T > \ + auto op_name(T const& obj) \ + { \ + auto value = make_ValueWrapper(op(obj.get_native())); \ + 
value.count(op_enum, 1); \ + return value; \ + } + +RAJAPERF_DEFINE_WRAPPER_UNARY_OPERATOR(operator+, +, Operation::uplus) +RAJAPERF_DEFINE_WRAPPER_UNARY_OPERATOR(operator-, -, Operation::uminus) + +template < WrappedPtr T > +auto operator+(T const& obj) +{ + Wrapper> value( + obj.m_allocation, +(obj.get_native())); + value.count(Operation::uplus, 1); + return value; +} + + +#define RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(op_name, op, op_enum) \ + template < NonWrapped lhs_T, Wrapped rhs_T > \ + auto op_name(lhs_T & lhs, \ + rhs_T const& rhs) \ + { \ + rhs.template count(op_enum, 1); \ + return lhs op rhs.get_native(); \ + } + +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator+=, +=, Operation::add) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator-=, -=, Operation::sub) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator*=, *=, Operation::mult) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator/=, /=, Operation::div) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator%=, %=, Operation::rem) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator&=, &=, Operation::bit_and) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator|=, |=, Operation::bit_or) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator^=, ^=, Operation::bit_xor) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator<<=, <<=, Operation::bit_lsh) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator>>=, >>=, Operation::bit_rsh) + + +#define RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(op_name, op, op_enum) \ + template < WrappedNonPtr lhs_T, WrappedNonPtr rhs_T > \ + auto op_name(lhs_T const& lhs, \ + rhs_T const& rhs) \ + { \ + auto value = make_ValueWrapper(lhs.get_native() op rhs.get_native()); \ + value.count(op_enum, 1); \ + return value; \ + } \ + template < WrappedNonPtr lhs_T, NonWrapped rhs_T > \ + auto op_name(lhs_T const& lhs, \ + rhs_T const& rhs) \ + { \ + auto value = 
make_ValueWrapper(lhs.get_native() op rhs); \ + value.count(op_enum, 1); \ + return value; \ + } \ + template < NonWrapped lhs_T, WrappedNonPtr rhs_T > \ + auto op_name(lhs_T const& lhs, \ + rhs_T const& rhs) \ + { \ + auto value = make_ValueWrapper(lhs op rhs.get_native()); \ + value.count(op_enum, 1); \ + return value; \ + } + +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator+, +, Operation::add) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator-, -, Operation::sub) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator*, *, Operation::mult) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator/, /, Operation::div) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator%, %, Operation::rem) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator&, &, Operation::bit_and) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator|, |, Operation::bit_or) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator^, ^, Operation::bit_xor) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator<<, <<, Operation::bit_lsh) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator>>, >>, Operation::bit_rsh) + + +template < typename lhs_T, typename rhs_T > +auto operator+(Wrapper const& lhs, + Wrapper const& rhs) +requires((Wrapper::is_ptr || Wrapper::is_ptr) && + !(Wrapper::is_ptr && Wrapper::is_ptr)) +{ + if constexpr (Wrapper::is_ptr) { + auto value = lhs.get_value_wrapper(); + value += rhs; + return value; + } else { + auto value = rhs.get_value_wrapper(); + value += lhs; + return value; + } +} +template < WrappedPtr lhs_T, NonWrapped rhs_T > +auto operator+(lhs_T const& lhs, + rhs_T const& rhs) +{ + auto value = lhs.get_value_wrapper(); + value += rhs; + return value; +} +template < NonWrapped lhs_T, WrappedPtr rhs_T > +auto operator+(lhs_T const& lhs, + rhs_T const& rhs) +{ + auto value = rhs.get_value_wrapper(); + value += lhs; + return value; +} + +template < WrappedPtr lhs_T, WrappedNonPtr rhs_T > +auto operator-(lhs_T const& lhs, + rhs_T const& rhs) +{ + auto value = lhs.get_value_wrapper(); + value -= rhs; + 
return value; +} +template < WrappedPtr lhs_T, NonWrapped rhs_T > +auto operator-(lhs_T const& lhs, + rhs_T const& rhs) +{ + auto value = lhs.get_value_wrapper(); + value -= rhs; + return value; +} + + +#define RAJAPERF_DEFINE_WRAPPER_COMPARISON_OPERATOR(op_name, op, op_enum) \ + template < typename lhs_T, typename rhs_T > \ + auto op_name(Wrapper const& lhs, \ + Wrapper const& rhs) \ + { \ + lhs.template count>(op_enum, 1); \ + return lhs.get_native() op rhs.get_native(); \ + } \ + template < typename lhs_T, NonWrapped rhs_T > \ + auto op_name(Wrapper const& lhs, \ + rhs_T const& rhs) \ + { \ + lhs.template count>(op_enum, 1); \ + return lhs.get_native() op rhs; \ + } \ + template < NonWrapped lhs_T, typename rhs_T > \ + auto op_name(lhs_T const& lhs, \ + Wrapper const& rhs) \ + { \ + rhs.template count>(op_enum, 1); \ + return lhs op rhs.get_native(); \ + } + +RAJAPERF_DEFINE_WRAPPER_COMPARISON_OPERATOR(operator==, ==, Operation::eq) +RAJAPERF_DEFINE_WRAPPER_COMPARISON_OPERATOR(operator!=, !=, Operation::ne) +RAJAPERF_DEFINE_WRAPPER_COMPARISON_OPERATOR(operator< , < , Operation::lt) +RAJAPERF_DEFINE_WRAPPER_COMPARISON_OPERATOR(operator<=, <=, Operation::le) +RAJAPERF_DEFINE_WRAPPER_COMPARISON_OPERATOR(operator> , > , Operation::gt) +RAJAPERF_DEFINE_WRAPPER_COMPARISON_OPERATOR(operator>=, >=, Operation::ge) + +template < Wrapped T > +void swap(T& lhs, T& rhs) +{ + lhs.swap(rhs); +} + +template < WrappedRef T > +void swap(T&& lhs, T&& rhs) +{ + std::move(lhs).swap(std::move(rhs)); +} + +// helper for getting right type +template < template typename T > +struct Array1WrapperHelper +{ + template < size_t N > + using type = Wrapper>; +}; +/// +template < template typename T > +struct Array2WrapperHelper +{ + template < size_t N0, size_t N1 > + using type = Wrapper>; +}; +/// +template < template typename T > +struct Array3WrapperHelper +{ + template < size_t N0, size_t N1, size_t N2 > + using type = Wrapper>; +}; +/// +template < template typename T > +struct 
Array4WrapperHelper +{ + template < size_t N0, size_t N1, size_t N2, size_t N3 > + using type = Wrapper>; +}; + +} // closing brace for counting namespace + +} // closing brace for rajaperf namespace + +namespace std +{ + +template < typename T > +struct iterator_traits<::rajaperf::counting::Wrapper> +{ + using difference_type = ::rajaperf::counting::Wrapper; + using value_type = ::rajaperf::counting::Wrapper>; + using pointer = ::rajaperf::counting::Wrapper; + using reference = ::rajaperf::counting::Wrapper; + using iterator_category = std::random_access_iterator_tag; +}; + +} // closing brace for std namespace + + +// Use this wrapper type in variable declarations in a kernel +// ex. +// RAJAPERF_WRAPPER(my_struct*) val; +// Note wrapping is done for most types in CountingMacros.hpp, but some types +// like structs specific to a kernel need to be wrapped manually +// Note do not use it if declaring variables with constant values +#define RAJAPERF_WRAPPER(type) type + +#endif // closing endif for header file include guard diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index 56314f265..c3250853f 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -254,95 +254,52 @@ inline void moveData(DataSpace new_dataSpace, DataSpace old_dataSpace, } -template +template struct AutoDataMover { - AutoDataMover(DataSpace new_dataSpace, DataSpace old_dataSpace, - T*& ptr, Size_type len, Size_type align) - : m_ptr(&ptr) - , m_new_dataSpace(new_dataSpace) - , m_old_dataSpace(old_dataSpace) - , m_len(len) - , m_align(align) + using res_type = decltype(std::declval()()); + + AutoDataMover(Alloc alloc, CopyFreeReassign copyFreeReassign) + : m_alloc(std::move(alloc)) + , m_copyFreeReassign(std::move(copyFreeReassign)) { } AutoDataMover(AutoDataMover const&) = delete; AutoDataMover& operator=(AutoDataMover const&) = delete; - AutoDataMover(AutoDataMover&& rhs) - : m_ptr(std::exchange(rhs.m_ptr, nullptr)) - , m_new_ptr(std::exchange(rhs.m_new_ptr, 
nullptr)) - , m_new_dataSpace(rhs.m_new_dataSpace) - , m_old_dataSpace(rhs.m_old_dataSpace) - , m_len(rhs.m_len) - , m_align(rhs.m_align) - { } - AutoDataMover& operator=(AutoDataMover&& rhs) - { - finalize(); - m_ptr = std::exchange(rhs.m_ptr, nullptr); - m_new_ptr = std::exchange(rhs.m_new_ptr, nullptr); - m_new_dataSpace = rhs.m_new_dataSpace; - m_old_dataSpace = rhs.m_old_dataSpace; - m_len = rhs.m_len; - m_align = rhs.m_align; - return *this; - } + AutoDataMover(AutoDataMover&& rhs) = delete; + AutoDataMover& operator=(AutoDataMover&& rhs) = delete; ~AutoDataMover() { finalize(); } - // Get the pointer that will replace *m_ptr after finalize is called. - // Use this to populate pointers into the final data structure but do not - // dereference this pointer in setup code. - T* get_final_ptr() + [[nodiscard]] res_type get_final_ptr() { - if (m_ptr && !m_new_ptr) { - - if (m_new_dataSpace != m_old_dataSpace) { - - allocData(m_new_dataSpace, m_new_ptr, m_len, m_align); - - } else { - - m_new_ptr = *m_ptr; - - } + if (!m_allocated) { + m_allocated = true; + m_res = m_alloc(); } - return m_new_ptr; + return m_res; } void finalize() { - if (m_ptr) { - - get_final_ptr(); - - if (m_new_dataSpace != m_old_dataSpace) { - - copyData(m_new_dataSpace, m_new_ptr, m_old_dataSpace, *m_ptr, m_len); - - deallocData(m_old_dataSpace, *m_ptr); - - *m_ptr = m_new_ptr; - - } - - m_ptr = nullptr; - m_new_ptr = nullptr; + if (!m_finalized) { + m_finalized = true; + res_type res = get_final_ptr(); + m_copyFreeReassign(res); } } private: - T** m_ptr; - T* m_new_ptr = nullptr; - DataSpace m_new_dataSpace; - DataSpace m_old_dataSpace; - Size_type m_len; - Size_type m_align; + Alloc m_alloc; + CopyFreeReassign m_copyFreeReassign; + bool m_allocated = false; + bool m_finalized = false; + res_type m_res; }; /*! 
diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index b2f4e971f..e735c4516 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -40,7 +40,10 @@ #include #include #include + +#include #include +#include #include #if defined(_WIN32) @@ -387,6 +390,7 @@ void Executor::setupSuite() const std::set& run_kern = run_params.getKernelIDsToRun(); for (auto kid = run_kern.begin(); kid != run_kern.end(); ++kid) { kernels.push_back( getKernelObject(*kid, run_params) ); + kernels.back()->setCountedAttributes(); } const std::set& run_var = run_params.getVariantIDsToRun(); @@ -648,180 +652,395 @@ void Executor::writeKernelInfoSummary(ostream& str, #endif } + const bool skip_if_nonpositive = !to_file; + // // Set up column headers and column widths for kernel summary output. // - size_t kernel_width = 0; - Index_type psize_width = 0; - Index_type reps_width = 0; - Index_type itsrep_width = 0; - Index_type bytesMovedrep_width = 0; - Index_type flopsrep_width = 0; - Index_type bytesTouchedrep_width = 0; - Index_type bytesReadrep_width = 0; - Index_type bytesWrittenrep_width = 0; - Index_type bytesModifyWrittenrep_width = 0; - Index_type bytesAtomicModifyWrittenrep_width = 0; - Index_type bytesAllocatedrep_width = 0; - size_t checksumConsistency_width = 0; - size_t operationalComplexity_width = 0; - - size_t dash_width = 0; + string attr_category_head(""); + + string kern_head("Kernels"); + Index_type kercol_width = static_cast(kern_head.size()); for (size_t ik = 0; ik < kernels.size(); ++ik) { - kernel_width = max(kernel_width, kernels[ik]->getName().size()); - psize_width = max(psize_width, kernels[ik]->getActualProblemSize()); - reps_width = max(reps_width, kernels[ik]->getRunReps()); - itsrep_width = max(itsrep_width, kernels[ik]->getItsPerRep()); - bytesMovedrep_width = max(bytesMovedrep_width, kernels[ik]->getBytesMovedPerRep()); - flopsrep_width = max(flopsrep_width, kernels[ik]->getFLOPsPerRep()); - bytesTouchedrep_width = 
max(bytesTouchedrep_width, kernels[ik]->getBytesTouchedPerRep()); - bytesReadrep_width = max(bytesReadrep_width, kernels[ik]->getBytesReadPerRep()); - bytesWrittenrep_width = max(bytesWrittenrep_width, kernels[ik]->getBytesWrittenPerRep()); - bytesModifyWrittenrep_width = max(bytesModifyWrittenrep_width, kernels[ik]->getBytesModifyWrittenPerRep()); - bytesAtomicModifyWrittenrep_width = max(bytesAtomicModifyWrittenrep_width, kernels[ik]->getBytesAtomicModifyWrittenPerRep()); - bytesAllocatedrep_width = max(bytesAllocatedrep_width, kernels[ik]->getBytesAllocatedPerRep()); - checksumConsistency_width = max(checksumConsistency_width, getChecksumConsistencyName(kernels[ik]->getChecksumConsistency()).size()); - operationalComplexity_width = max(operationalComplexity_width, getComplexityName(kernels[ik]->getComplexity()).size()+3); + kercol_width = max(kercol_width, static_cast(kernels[ik]->getName().size())); } + kercol_width += 2; +// +// Set up separators and width parameters. +// const string sepchr(" , "); - string kernel_head("Kernel"); - kernel_width = max( kernel_head.size(), - kernel_width ) + 2; - dash_width += kernel_width; + Index_type current_width = 0; // does not contain kercol_width + const Index_type screen_width = 80; + const Index_type max_width = to_file ? 
std::numeric_limits::max() + : max(screen_width-kercol_width, screen_width/2); - double psize = log10( static_cast(psize_width) ); - string psize_head("Problem size"); - psize_width = max( static_cast(psize_head.size()), - static_cast(psize) ) + 3; - dash_width += psize_width + static_cast(sepchr.size()); - - double rsize = log10( static_cast(reps_width) ); - string rsize_head("Reps"); - reps_width = max( static_cast(rsize_head.size()), - static_cast(rsize) ) + 3; - dash_width += reps_width + static_cast(sepchr.size()); - - double irsize = log10( static_cast(itsrep_width) ); - string itsrep_head("Iterations/rep"); - itsrep_width = max( static_cast(itsrep_head.size()), - static_cast(irsize) ) + 3; - dash_width += itsrep_width + static_cast(sepchr.size()); - - string kernsrep_head("Kernels/rep"); - Index_type kernsrep_width = - max( static_cast(kernsrep_head.size()), - static_cast(4) ); - dash_width += kernsrep_width + static_cast(sepchr.size()); - - double brsize = log10( static_cast(bytesMovedrep_width) ); - string bytesMovedrep_head("BytesMoved/rep"); - bytesMovedrep_width = max( static_cast(bytesMovedrep_head.size()), - static_cast(brsize) ) + 3; - dash_width += bytesMovedrep_width + static_cast(sepchr.size()); - - double frsize = log10( static_cast(flopsrep_width) ); - string flopsrep_head("FLOPS/rep"); - flopsrep_width = max( static_cast(flopsrep_head.size()), - static_cast(frsize) ) + 3; - dash_width += flopsrep_width + static_cast(sepchr.size()); - - double btrsize = log10( static_cast(bytesTouchedrep_width) ); - string bytesTouchedrep_head("BytesTouched/rep"); - bytesTouchedrep_width = max( static_cast(bytesTouchedrep_head.size()), - static_cast(btrsize) ) + 3; - dash_width += bytesTouchedrep_width + static_cast(sepchr.size()); - - double brrsize = log10( static_cast(bytesReadrep_width) ); - string bytesReadrep_head("BytesRead/rep"); - bytesReadrep_width = max( static_cast(bytesReadrep_head.size()), - static_cast(brrsize) ) + 3; - dash_width += 
bytesReadrep_width + static_cast(sepchr.size()); - - double bwrsize = log10( static_cast(bytesWrittenrep_width) ); - string bytesWrittenrep_head("BytesWritten/rep"); - bytesWrittenrep_width = max( static_cast(bytesWrittenrep_head.size()), - static_cast(bwrsize) ) + 3; - dash_width += bytesWrittenrep_width + static_cast(sepchr.size()); - - double bmwrsize = log10( static_cast(bytesModifyWrittenrep_width) ); - string bytesModifyWrittenrep_head("BytesModifyWritten/rep"); - bytesModifyWrittenrep_width = max( static_cast(bytesModifyWrittenrep_head.size()), - static_cast(bmwrsize) ) + 3; - dash_width += bytesModifyWrittenrep_width + static_cast(sepchr.size()); - - double bamrrsize = log10( static_cast(bytesAtomicModifyWrittenrep_width) ); - string bytesAtomicModifyWrittenrep_head("BytesAtomicModifyWritten/rep"); - bytesAtomicModifyWrittenrep_width = max( static_cast(bytesAtomicModifyWrittenrep_head.size()), - static_cast(bamrrsize) ) + 3; - dash_width += bytesAtomicModifyWrittenrep_width + static_cast(sepchr.size()); - - double barsize = log10( static_cast(bytesAllocatedrep_width) ); - string bytesAllocatedrep_head("BytesAllocated/rep"); - bytesAllocatedrep_width = max( static_cast(bytesAllocatedrep_head.size()), - static_cast(barsize) ) + 3; - dash_width += bytesAllocatedrep_width + static_cast(sepchr.size()); - - string checksumConsistency_head("ChecksumConsistency"); - checksumConsistency_width = max( checksumConsistency_head.size(), - checksumConsistency_width ) + 2; - dash_width += checksumConsistency_width + static_cast(sepchr.size()); - - string operationalComplexity_head("OperationalComplexity"); - operationalComplexity_width = max( operationalComplexity_head.size(), - operationalComplexity_width ) + 2; - dash_width += operationalComplexity_width + static_cast(sepchr.size()); +// +// Set up storage for attributes which will become the columns. 
+// + struct Attribute + { + std::string category_name; + std::string name; + Index_type width; + std::function getter; + }; - str < attrs; + +// +// function used to print the table, includes the kernel column and attr columns. +// Clears attr columns after printing to make using more than once easier. +// + auto print_attr_table = [&]() { + + // print row of categories + str <getName(); + for (Attribute const& attr : attrs) { + str << sepchr <(category_name.size()), + static_cast(name.size()) ); + + using value_type = decltype(getter(kernels[0])); + + std::function attr_getter; + + if constexpr (std::integral) { + + Index_type max_value = std::numeric_limits::min(); + for (size_t ik = 0; ik < kernels.size(); ++ik) { + max_value = max(max_value, getter(kernels[ik])); + } + if (skip_if_nonpositive && max_value <= static_cast(0)) return; + max_value = max(max_value, static_cast(1)); + double value_width = log10(static_cast(max_value)) + 1.0; + width = max( width, static_cast(value_width) ); + + attr_getter = [=](KernelBase const* kern) { + std::ostringstream str; + str << getter(kern); + return str.str(); + }; + + } else if constexpr (std::convertible_to) { + + Index_type max_size = 0; + for (size_t ik = 0; ik < kernels.size(); ++ik) { + auto value = getter(kernels[ik]); + std::string_view view = value; + max_size = max( max_size, static_cast(view.size()) ); + } + width = max( width, max_size ); + + attr_getter = [=](KernelBase const* kern) { + return std::string(getter(kern)); + }; + + } else { + + static_assert(false); + } + + width += 2; + Index_type width_with_sep = static_cast(sepchr.size()) + width; + + if (current_width + width_with_sep > max_width) { + print_attr_table(); + } + + current_width += width_with_sep; + attrs.emplace_back(Attribute{category_name, name, width, std::move(attr_getter)}); + }; + +// +// user settable attributes +// + add_attr("Input", "Problem size", [](KernelBase const* kernel){ + return static_cast(kernel->getActualProblemSize()); + 
}); + + add_attr("Input", "Reps", [](KernelBase const* kernel){ + return static_cast(kernel->getRunReps()); + }); + + if ( !to_file && current_width > 0 ) { + print_attr_table(); } - for (size_t ik = 0; ik < kernels.size(); ++ik) { - KernelBase* kern = kernels[ik]; - str <getName() - << sepchr <getActualProblemSize() - << sepchr <getRunReps() - << sepchr <getItsPerRep() - << sepchr <getKernelsPerRep() - << sepchr <getBytesMovedPerRep() - << sepchr <getFLOPsPerRep() - << sepchr <getBytesTouchedPerRep() - << sepchr <getBytesReadPerRep() - << sepchr <getBytesWrittenPerRep() - << sepchr <getBytesModifyWrittenPerRep() - << sepchr <getBytesAtomicModifyWrittenPerRep() - << sepchr <getBytesAllocatedPerRep() - << sepchr <getChecksumConsistency()) - << sepchr <getComplexity())+")") - << endl; +// +// manually counted attributes +// + add_attr("Estimate", "Iterations/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getItsPerRep()); + }); + + add_attr("Estimate", "Kernels/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getKernelsPerRep()); + }); + + add_attr("Estimate", "BytesMoved/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getBytesMovedPerRep()); + }); + + add_attr("Estimate", "FLOPS/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getFLOPsPerRep()); + }); + + add_attr("Estimate", "BytesTouched/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getBytesTouchedPerRep()); + }); + + add_attr("Estimate", "BytesRead/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getBytesReadPerRep()); + }); + + add_attr("Estimate", "BytesWritten/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getBytesWrittenPerRep()); + }); + + add_attr("Estimate", "BytesModifyWritten/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getBytesModifyWrittenPerRep()); + }); + + add_attr("Estimate", "BytesAtomicModifyWritten/rep", [](KernelBase const* kernel){ + return 
static_cast(kernel->getBytesAtomicModifyWrittenPerRep()); + }); + + add_attr("Estimate", "BytesAllocated/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getBytesAllocatedPerRep()); + }); + + add_attr("Estimate", "ChecksumConsistency", [](KernelBase const* kernel){ + return getChecksumConsistencyName(kernel->getChecksumConsistency()); + }); + + add_attr("Estimate", "OperationalComplexity", [](KernelBase const* kernel){ + return "O("+getComplexityName(kernel->getComplexity())+")"; + }); + + if ( !to_file && current_width > 0 ) { + print_attr_table(); + } + +// +// automatically counted high level attributes +// + add_attr("Counted", "Iterations/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getCountedItsPerRep()); + }); + + add_attr("Counted", "ParallelIterations/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getCountedParItsPerRep()); + }); + + add_attr("Counted", "MaxLoopNestDepth", [](KernelBase const* kernel){ + return static_cast(kernel->getCountedMaxLoopNestDepth()); + }); + + add_attr("Counted", "MaxParallelLoopNestDepth", [](KernelBase const* kernel){ + return static_cast(kernel->getCountedMaxParLoopNestDepth()); + }); + + add_attr("Counted", "Kernels/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getCountedKernelsPerRep()); + }); + + add_attr("Counted", "Synchronizes/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getCountedSyncsPerRep()); + }); + + add_attr("Counted", "TeamSynchronizes/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getCountedTeamSyncsPerRep()); + }); + + if ( !to_file && current_width > 0 ) { + print_attr_table(); + } + +// +// automatically counted memory attributes, at the per loop usage granularity +// + for (Size_type g = 0; g < Size_type(counting::AllocationGroup::NumAllocationGroups); ++g) { + auto gg = counting::AllocationGroup(g); + + std::string num_allocations_name = std::format("{}NumAllocations", + 
counting::getAllocationGroupName(gg)); + + add_attr("Counted", num_allocations_name, [gg](KernelBase const* kernel){ + return static_cast(kernel->getCountedNumAllocations(gg)); + }); + + std::string bytes_allocated_name = std::format("{}AllocatedBytes", + counting::getAllocationGroupName(gg)); + + add_attr("Counted", bytes_allocated_name, [gg](KernelBase const* kernel){ + return static_cast(kernel->getCountedAllocatedBytes(gg)); + }); + + std::string bytes_moved_total_name = std::format("{}BytesMovedTotal/rep", + counting::getAllocationGroupName(gg)); + + add_attr("Counted", bytes_moved_total_name, [gg](KernelBase const* kernel){ + return static_cast( + kernel->getCountedTotalBytesMoved(gg)); + }); + + std::string bytes_touched_total_name = std::format("{}BytesTouchedTotal/rep", + counting::getAllocationGroupName(gg)); + + add_attr("Counted", bytes_touched_total_name, [gg](KernelBase const* kernel){ + return static_cast( + kernel->getCountedTotalBytesTouched(gg)); + }); + + for (Size_type a = 0; a < Size_type(counting::MemoryAccess::NumMemoryAccesses); ++a) { + auto aa = counting::MemoryAccess(a); + + std::string bytes_total_accessed_name = std::format("{}Bytes{}Total/rep", + counting::getAllocationGroupName(gg), + counting::getMemoryAccessNamePastTenseTitle(aa)); + add_attr("Counted", bytes_total_accessed_name, [gg, aa](KernelBase const* kernel){ + return static_cast( + kernel->getCountedTotalBytesPerAccess(gg, aa)); + }); + + } + + for (Size_type p = 0; p < Size_type(counting::CountingPoint::NumCountingPoints); ++p) { + auto pp = counting::CountingPoint(p); + + std::string bytes_moved_name = std::format("{}BytesMoved/{}", + counting::getAllocationGroupName(gg), + counting::getCountingPointName(pp)); + + add_attr("Counted", bytes_moved_name, [pp, gg](KernelBase const* kernel){ + return static_cast( + kernel->getCountedBytesMoved(pp, gg)); + }); + + std::string bytes_touched_name = std::format("{}BytesTouched/{}", + counting::getAllocationGroupName(gg), + 
counting::getCountingPointName(pp)); + + add_attr("Counted", bytes_touched_name, [pp, gg](KernelBase const* kernel){ + return static_cast( + kernel->getCountedBytesTouched(pp, gg)); + }); + + for (Size_type a = 0; a < Size_type(counting::MemoryAccess::NumMemoryAccesses); ++a) { + auto aa = counting::MemoryAccess(a); + + std::string bytes_accessed_name = std::format("{}Bytes{}/{}", + counting::getAllocationGroupName(gg), + counting::getMemoryAccessNamePastTenseTitle(aa), + counting::getCountingPointName(pp)); + add_attr("Counted", bytes_accessed_name, [pp, gg, aa](KernelBase const* kernel){ + return static_cast( + kernel->getCountedBytesPerAccess(pp, gg, aa)); + }); + + } + + } + } + + if ( !to_file && current_width > 0 ) { + print_attr_table(); + } + +// +// automatically counted operations attributes +// + for (Size_type ot = 0; ot < Size_type(counting::OpType::NumOpTypes); ++ot) { + + std::string opTypeName = counting::getOpTypeName(counting::OpType(ot)); + + add_attr("Counted", opTypeName+"_ops/rep", [ot](KernelBase const* kernel){ + return static_cast(kernel->getCountedArithmeticOpsPerRep(counting::OpType(ot))); + }, skip_if_nonpositive); + + for (Size_type op = 0; op < Size_type(counting::Operation::NumOperations); ++op) { + + std::string opName = counting::getOperationName(counting::Operation(op)); + + add_attr("Counted", opTypeName+"_"+opName+"/rep", [ot, op](KernelBase const* kernel){ + return static_cast(kernel->getCountedOpsPerRep(counting::OpType(ot), counting::Operation(op))); + }, skip_if_nonpositive); + + } + + if ( !to_file && current_width > 0 ) { + print_attr_table(); + } + + } + + if (current_width > 0) { + print_attr_table(); } str.flush(); } +void Executor::writeKernelCounterSummary(ostream& str, + vector const& kernels) const +{ + for (size_t ik = 0; ik < kernels.size(); ++ik) { + str << "\n/******** Kernel " << kernels[ik]->getName() << " ********/\n"; + kernels[ik]->printCounters(str); + } + str.flush(); +} + + void 
Executor::writeKernelRunDataSummary(ostream& str, vector const& kernels) const { @@ -1243,6 +1462,11 @@ void Executor::outputRunData() file = openOutputFile(out_fprefix + "-fom.csv"); writeFOMReport(*file, kernels, fom_groups); } + + file = openOutputFile(out_fprefix + "-counters.txt"); + if ( *file ) { + writeKernelCounterSummary(*file, kernels); + } } #if defined(RAJA_PERFSUITE_USE_CALIPER) @@ -1297,6 +1521,10 @@ void Executor::outputRunData() writeFOMReport(*file, mykernel, fom_groups); } + if ( *file ) { + writeKernelCounterSummary(*file, mykernel); + } + } } diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index 1d5fe973c..2f21ee043 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -74,6 +74,8 @@ class Executor std::unique_ptr openOutputFile(const std::string& filename) const; void writeSeparator(std::ostream& file); + void writeKernelCounterSummary(std::ostream& str, + std::vector const& kernels) const; void writeKernelInfoSummary(std::ostream& str, std::vector const& kernels, diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index f9f8ee842..32bce3c21 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -15,6 +15,8 @@ #include "common/DataUtils.hpp" #include "common/RunParams.hpp" #include "common/GPUUtils.hpp" +#include "common/CountingData.hpp" +#include "common/CountingWrapper.hpp" #include "RAJA/util/Timer.hpp" #include "RAJA/util/reduce.hpp" @@ -39,6 +41,10 @@ #include #include #include +#include +#include +#include +#include #if defined(RAJA_PERFSUITE_USE_CALIPER) @@ -207,25 +213,107 @@ class KernelBase Index_type getDefaultProblemSize() const { return default_prob_size; } Index_type getActualProblemSize() const { return actual_prob_size; } Index_type getDefaultReps() const { return default_reps; } + Index_type getTargetProblemSize() const; Index_type getRunReps() const { return s_warmup_run ? 
1 : actual_reps; } - Index_type getItsPerRep() const { return its_per_rep; } - Index_type getKernelsPerRep() const { return kernels_per_rep; } + + Index_type getItsPerRep() const { return its_per_rep; }; + Index_type getKernelsPerRep() const { return kernels_per_rep; }; + Index_type getBytesAllocatedPerRep() const { return bytes_allocated_per_rep; } - Index_type getBytesMovedPerRep() const { return bytes_read_per_rep + bytes_written_per_rep + 2*bytes_modify_written_per_rep + 2*bytes_atomic_modify_written_per_rep; } // count modify_write operations twice to get the memory traffic + Index_type getBytesMovedPerRep() const { return bytes_read_per_rep + bytes_written_per_rep + 2*bytes_modify_written_per_rep + 2*bytes_atomic_modify_written_per_rep; } // count atomic_modify_write operations as a read and a write to match previous counting Index_type getBytesTouchedPerRep() const { return bytes_read_per_rep + bytes_written_per_rep + bytes_modify_written_per_rep + bytes_atomic_modify_written_per_rep; } // count modify_write operations once to get the data size only Index_type getBytesReadPerRep() const { return bytes_read_per_rep + bytes_modify_written_per_rep; } Index_type getBytesWrittenPerRep() const { return bytes_written_per_rep + bytes_modify_written_per_rep; } Index_type getBytesModifyWrittenPerRep() const { return bytes_modify_written_per_rep; } Index_type getBytesAtomicModifyWrittenPerRep() const { return bytes_atomic_modify_written_per_rep; } + Index_type getFLOPsPerRep() const { return FLOPs_per_rep; } + + + Index_type getCountedItsPerRep() const { return countingData ? (countingData->all_it_per_rep_counter) : -1; } + Index_type getCountedParItsPerRep() const { return countingData ? (countingData->par_it_per_rep_counter) : -1; } + Index_type getCountedMaxLoopNestDepth() const { return countingData ? (countingData->max_all_loop_depth) : -1; } + Index_type getCountedMaxParLoopNestDepth() const { return countingData ? 
(countingData->max_par_loop_depth) : -1; } + Index_type getCountedKernelsPerRep() const { return countingData ? countingData->kernel_per_rep_counter : -1; } + Index_type getCountedSyncsPerRep() const { return countingData ? countingData->par_sync_per_rep_counter : -1; } + Index_type getCountedTeamSyncsPerRep() const { return countingData ? countingData->team_sync_per_rep_counter : -1; } + Index_type getCountedNumAllocations(counting::AllocationGroup g) const { return countingData ? countingData->memory_allocations[Size_type(g)] : -1; } + Index_type getCountedAllocatedBytes(counting::AllocationGroup g) const { return countingData ? countingData->memory_bytes[Size_type(g)] : -1; } + + Index_type getCountedTotalBytesMoved(counting::AllocationGroup g) const + { + Index_type count = -1; + if (countingData) { + count = 0; + for (Size_type a = 0; a < Size_type(counting::MemoryAccess::NumMemoryAccesses); ++a) { + count += countingData->memory_total_bytes[Size_type(g)].accessed[a]; + if (counting::MemoryAccess(a) == counting::MemoryAccess::atomicModifyWrite) { + count += countingData->memory_total_bytes[Size_type(g)].accessed[a]; // count twice, as both a read and a write + } + } + }; + return count; + } + Index_type getCountedTotalBytesTouched(counting::AllocationGroup g) const + { + Index_type count = -1; + if (countingData) { + count = 0; + for (Size_type a = 0; a < Size_type(counting::MemoryAccess::NumMemoryAccesses); ++a) { + count += countingData->memory_total_bytes[Size_type(g)].accessed[a]; + } + }; + return count; + } + Index_type getCountedTotalBytesPerAccess(counting::AllocationGroup g, counting::MemoryAccess ma) const + { return countingData ? countingData->memory_total_bytes[Size_type(g)].accessed[Size_type(ma)] : -1; } + + // count atomic_modify_write operations as a read and a write to match previous counting + Index_type getCountedBytesTouched(counting::CountingPoint p, counting::AllocationGroup g) const + { return countingData ? 
countingData->memory_totals_bytes[Size_type(p)][Size_type(g)].touched : -1; } + Index_type getCountedBytesMoved(counting::CountingPoint p, counting::AllocationGroup g) const + { + Index_type count = -1; + if (countingData) { + count = 0; + for (Size_type a = 0; a < Size_type(counting::MemoryAccess::NumMemoryAccesses); ++a) { + count += countingData->memory_totals_bytes[Size_type(p)][Size_type(g)].accessed[a]; + if (counting::MemoryAccess(a) == counting::MemoryAccess::atomicModifyWrite) { + count += countingData->memory_totals_bytes[Size_type(p)][Size_type(g)].accessed[a]; // count twice, as both a read and a write + } + } + }; + return count; + } + Index_type getCountedBytesPerAccess(counting::CountingPoint p, counting::AllocationGroup g, counting::MemoryAccess ma) const + { return countingData ? countingData->memory_totals_bytes[Size_type(p)][Size_type(g)].accessed[Size_type(ma)] : -1; } + + Index_type getCountedOpsPerRep(counting::OpType ot, counting::Operation op) const { return countingData ? 
countingData->operation_counters[Size_type(ot)][Size_type(op)] : -1; } + + Index_type getCountedArithmeticOpsPerRep(counting::OpType ot) const + { + Index_type count = -1; + if (countingData) { + // count a subset of operations including things like add, sub, mult, div, abs, sqrt, but not assign, eq, ne, lt, le, gt, or ge + count = 0; + for (Size_type op = Size_type(counting::Operation::FLOP_begin); + op < Size_type(counting::Operation::FLOP_end); ++op) { + count += countingData->operation_counters[Size_type(ot)][op]; + } + } + return count; + } + + double getBlockSize() const { return kernel_block_size; } + ChecksumConsistency getChecksumConsistency() const { return checksum_consistency; }; Checksum_type getChecksumTolerance() const { return checksum_tolerance; } Complexity getComplexity() const { return complexity; }; + Index_type getMaxPerfectLoopDimensions() const { return num_nested_perfect_loops; }; Index_type getProblemDimensionality() const { return problem_dimensionality; }; - bool usesFeature(FeatureID fid) const { return uses_feature[fid]; }; bool hasVariantDefined(VariantID vid) const @@ -429,151 +517,259 @@ class KernelBase DataSpace getReductionDataSpace(VariantID vid) const; DataSpace getMPIDataSpace(VariantID vid) const; - template - void allocData(DataSpace dataSpace, T& ptr, Size_type len) + + + virtual void setCountedAttributes() {}; // + + + counting::ScopedContext initializeCounters( + std::initializer_list wrapper_formats, + std::source_location location = std::source_location::current()) { - rajaperf::allocData(dataSpace, - ptr, len, getDataAlignment()); + countingData = std::make_unique(); + countingData->set_formats(wrapper_formats); + enable_data_registration = true; + return countingData->create_context("", location); } - template - void allocAndInitData(DataSpace dataSpace, T*& ptr, Size_type len) + void finalizeCounters(counting::ScopedContext& context, + std::source_location location = std::source_location::current()) { - 
rajaperf::allocAndInitData(dataSpace, - ptr, len, getDataAlignment()); + context.release(); + enable_data_registration = false; + countingData->finalize_context(location); } - template - void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, Size_type len, V val) + void printCounters(std::ostream& str) const { - rajaperf::allocAndInitDataConst(dataSpace, - ptr, len, getDataAlignment(), val); + if (countingData) { + countingData->print(str); + } } - template - void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, Size_type len) + + void registerData(counting::pointer auto& ptr, + counting::integral auto const& len, + counting::raw_pointer auto ptr_ptr, + std::source_location location = std::source_location::current()) { - rajaperf::allocAndInitDataRandSign(dataSpace, - ptr, len, getDataAlignment()); + using pointed_to_type = counting::pointed_to_type_t; + if (!enable_data_registration) return; + countingData->add_allocation( + counting::get_type_name(), + static_cast(counting::get_value(ptr)), + counting::get_value(len), sizeof(pointed_to_type), + static_cast(ptr_ptr), location); } - template - void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, Size_type len) + void deRegisterData(counting::pointer auto& ptr, + counting::raw_pointer auto ptr_ptr, + std::source_location location = std::source_location::current()) { - rajaperf::allocAndInitDataRandValue(dataSpace, - ptr, len, getDataAlignment()); + if (!enable_data_registration) return; + countingData->remove_allocation( + static_cast(counting::get_value(ptr)), + static_cast(ptr_ptr), location); } - template - rajaperf::AutoDataMover scopedMoveData(DataSpace dataSpace, T*& ptr, Size_type len) + void allocData(DataSpace dataSpace, counting::pointer auto& ptr_in, + counting::integral auto const& len_in, + std::source_location location = std::source_location::current()) + { + auto ptr = counting::get_value(ptr_in); + Size_type len = counting::get_value(len_in); + rajaperf::allocData(dataSpace, ptr, len, 
getDataAlignment()); + registerData(ptr, len, &counting::get_value(ptr_in), location); + ptr_in = ptr; + } + + void allocAndInitData(DataSpace dataSpace, counting::pointer auto& ptr_in, + counting::integral auto const& len_in, + std::source_location location = std::source_location::current()) + { + auto ptr = counting::get_value(ptr_in); + Size_type len = counting::get_value(len_in); + rajaperf::allocAndInitData(dataSpace, ptr, len, getDataAlignment()); + registerData(ptr, len, &counting::get_value(ptr_in), location); + ptr_in = ptr; + } + + void allocAndInitDataConst(DataSpace dataSpace, counting::pointer auto& ptr_in, + counting::integral auto const& len_in, auto const& val, + std::source_location location = std::source_location::current()) + { + auto ptr = counting::get_value(ptr_in); + Size_type len = counting::get_value(len_in); + rajaperf::allocAndInitDataConst(dataSpace, ptr, len, getDataAlignment(), counting::get_value(val)); + registerData(ptr, len, &counting::get_value(ptr_in), location); + ptr_in = ptr; + } + + void allocAndInitDataRandSign(DataSpace dataSpace, counting::pointer auto& ptr_in, + counting::integral auto const& len_in, + std::source_location location = std::source_location::current()) + { + auto ptr = counting::get_value(ptr_in); + Size_type len = counting::get_value(len_in); + rajaperf::allocAndInitDataRandSign(dataSpace, ptr, len, getDataAlignment()); + registerData(ptr, len, &counting::get_value(ptr_in), location); + ptr_in = ptr; + } + + void allocAndInitDataRandValue(DataSpace dataSpace, counting::pointer auto& ptr_in, + counting::integral auto const& len_in, + std::source_location location = std::source_location::current()) + { + auto ptr = counting::get_value(ptr_in); + Size_type len = counting::get_value(len_in); + rajaperf::allocAndInitDataRandValue(dataSpace, ptr, len, getDataAlignment()); + registerData(ptr, len, &counting::get_value(ptr_in), location); + ptr_in = ptr; + } + + auto scopedMoveDataForInit(DataSpace dataSpace, 
DataSpace hds, counting::raw_pointer auto& ptr, + counting::integral auto const& len_in, + std::source_location location = std::source_location::current()) + { + Size_type len = counting::get_value(len_in); + Size_type align = getDataAlignment(); + KernelBase& self = *this; + return rajaperf::AutoDataMover([=, &ptr](){ + + auto new_ptr = ptr; + if (dataSpace != hds) { + rajaperf::allocData(dataSpace, new_ptr, len, align); + } + return new_ptr; + + }, [=, &self, &ptr](auto new_ptr){ + + if (dataSpace != hds) { + rajaperf::copyData(dataSpace, new_ptr, hds, ptr, len); + rajaperf::deallocData(hds, ptr); + ptr = new_ptr; + } + + self.registerData(ptr, len, &ptr, location); + + }); + } + + auto allocDataForInit(DataSpace dataSpace, counting::raw_pointer auto& ptr, + counting::integral auto const& len_in, + std::source_location location = std::source_location::current()) { DataSpace hds = rajaperf::hostCopyDataSpace(dataSpace); - rajaperf::moveData(hds, dataSpace, ptr, len, getDataAlignment()); - return {dataSpace, hds, ptr, len, getDataAlignment()}; + Size_type len = counting::get_value(len_in); + rajaperf::allocData(hds, ptr, len, getDataAlignment()); + // don't register temporary data + return scopedMoveDataForInit(dataSpace, hds, ptr, len, location); } - template - void copyData(DataSpace dst_dataSpace, T* dst_ptr, - DataSpace src_dataSpace, const T* src_ptr, - Size_type len) + auto allocAndInitDataForInit(DataSpace dataSpace, counting::raw_pointer auto& ptr, + counting::integral auto const& len_in, + std::source_location location = std::source_location::current()) { - rajaperf::copyData(dst_dataSpace, dst_ptr, src_dataSpace, src_ptr, len); + DataSpace hds = rajaperf::hostCopyDataSpace(dataSpace); + Size_type len = counting::get_value(len_in); + rajaperf::allocAndInitData(hds, ptr, len, getDataAlignment()); + // don't register temporary data + return scopedMoveDataForInit(dataSpace, hds, ptr, len, location); } - template - void deallocData(DataSpace dataSpace, T& ptr) 
+ auto allocAndInitDataConstForInit(DataSpace dataSpace, counting::raw_pointer auto& ptr, + counting::integral auto const& len_in, auto const& val, + std::source_location location = std::source_location::current()) { - rajaperf::deallocData(dataSpace, ptr); + DataSpace hds = rajaperf::hostCopyDataSpace(dataSpace); + Size_type len = counting::get_value(len_in); + rajaperf::allocAndInitDataConst(hds, ptr, len, getDataAlignment(), counting::get_value(val)); + // don't register temporary data + return scopedMoveDataForInit(dataSpace, hds, ptr, len, location); } - template - void allocData(T*& ptr, Size_type len, VariantID vid) + void copyData(DataSpace dst_dataSpace, counting::convertible_to_pointer auto const& dst, + DataSpace src_dataSpace, counting::convertible_to_pointer auto const& src, + counting::integral auto const& len) { - rajaperf::allocData(getDataSpace(vid), - ptr, len, getDataAlignment()); + rajaperf::copyData(dst_dataSpace, counting::get_value(dst), + src_dataSpace, counting::get_value(src), len); } - template - void allocAndCopyHostData(T*& dst_ptr, - const T* src_ptr, - Size_type len, - VariantID vid) + void deallocData(DataSpace dataSpace, counting::pointer auto& ptr_in, + std::source_location location = std::source_location::current()) { - rajaperf::allocData(getDataSpace(vid), - dst_ptr, len, getDataAlignment()); + auto ptr = counting::get_value(ptr_in); + deRegisterData(ptr, &counting::get_value(ptr_in), location); + rajaperf::deallocData(dataSpace, ptr); + ptr_in = nullptr; + } + - rajaperf::copyData(getDataSpace(vid), - dst_ptr, DataSpace::Host, src_ptr, len); + void allocData(counting::pointer auto& ptr, counting::integral auto const& len, VariantID vid, + std::source_location location = std::source_location::current()) + { + allocData(getDataSpace(vid), ptr, len, location); } - template - void allocAndInitData(T*& ptr, Size_type len, VariantID vid) + void allocAndCopyHostData(counting::pointer auto& dst_ptr, + counting::convertible_to_pointer 
auto const& src, + counting::integral auto const& len, + VariantID vid, + std::source_location location = std::source_location::current()) { - rajaperf::allocAndInitData(getDataSpace(vid), - ptr, len, getDataAlignment()); + allocData(getDataSpace(vid), dst_ptr, len, location); + copyData(getDataSpace(vid), dst_ptr, DataSpace::Host, src, len); } - template - void allocAndInitDataConst(T*& ptr, Size_type len, V val, VariantID vid) + void allocAndInitData(counting::pointer auto& ptr, counting::integral auto const& len, VariantID vid, + std::source_location location = std::source_location::current()) { - rajaperf::allocAndInitDataConst(getDataSpace(vid), - ptr, len, getDataAlignment(), val); + allocAndInitData(getDataSpace(vid), ptr, len, location); } - template - void allocAndInitDataRandSign(T*& ptr, Size_type len, VariantID vid) + void allocAndInitDataConst(counting::pointer auto& ptr, counting::integral auto const& len, + auto const& val, VariantID vid, + std::source_location location = std::source_location::current()) { - rajaperf::allocAndInitDataRandSign(getDataSpace(vid), - ptr, len, getDataAlignment()); + allocAndInitDataConst(getDataSpace(vid), ptr, len, val, location); } - template - void allocAndInitDataRandValue(T*& ptr, Size_type len, VariantID vid) + void allocAndInitDataRandSign(counting::pointer auto& ptr, counting::integral auto const& len, VariantID vid, + std::source_location location = std::source_location::current()) { - rajaperf::allocAndInitDataRandValue(getDataSpace(vid), - ptr, len, getDataAlignment()); + allocAndInitDataRandSign(getDataSpace(vid), ptr, len, location); } - template - rajaperf::AutoDataMover allocDataForInit(T*& ptr, Size_type len, VariantID vid) + void allocAndInitDataRandValue(counting::pointer auto& ptr, counting::integral auto const& len, VariantID vid, + std::source_location location = std::source_location::current()) { - DataSpace ds = getDataSpace(vid); - DataSpace hds = rajaperf::hostCopyDataSpace(ds); - 
rajaperf::allocData(hds, ptr, len, getDataAlignment()); - return {ds, hds, ptr, len, getDataAlignment()}; + allocAndInitDataRandValue(getDataSpace(vid), ptr, len, location); } - template - rajaperf::AutoDataMover allocAndInitDataForInit(T*& ptr, Size_type len, VariantID vid) + auto allocDataForInit(counting::raw_pointer auto& ptr, counting::integral auto const& len, VariantID vid, + std::source_location location = std::source_location::current()) { - DataSpace ds = getDataSpace(vid); - DataSpace hds = rajaperf::hostCopyDataSpace(ds); - rajaperf::allocAndInitData(hds, ptr, len, getDataAlignment()); - return {ds, hds, ptr, len, getDataAlignment()}; + return allocDataForInit(getDataSpace(vid), ptr, len, location); } - template - rajaperf::AutoDataMover allocAndInitDataConstForInit(T*& ptr, Size_type len, T val, VariantID vid) + auto allocAndInitDataForInit(counting::raw_pointer auto& ptr, counting::integral auto const& len, VariantID vid, + std::source_location location = std::source_location::current()) { - DataSpace ds = getDataSpace(vid); - DataSpace hds = rajaperf::hostCopyDataSpace(ds); - rajaperf::allocAndInitDataConst(hds, ptr, len, getDataAlignment(), val); - return {ds, hds, ptr, len, getDataAlignment()}; + return allocAndInitDataForInit(getDataSpace(vid), ptr, len, location); } - template - rajaperf::AutoDataMover scopedMoveData(T*& ptr, Size_type len, VariantID vid) + auto allocAndInitDataConstForInit(counting::raw_pointer auto& ptr, counting::integral auto const& len, + auto const& val, VariantID vid, + std::source_location location = std::source_location::current()) { - DataSpace ds = getDataSpace(vid); - DataSpace hds = rajaperf::hostCopyDataSpace(ds); - rajaperf::moveData(hds, ds, ptr, len, getDataAlignment()); - return {ds, hds, ptr, len, getDataAlignment()}; + return allocAndInitDataConstForInit(getDataSpace(vid), ptr, len, val, location); } - template - void deallocData(T*& ptr, VariantID vid) + void deallocData(counting::pointer auto& ptr, VariantID 
vid, + std::source_location location = std::source_location::current()) { - rajaperf::deallocData(getDataSpace(vid), ptr); + deallocData(getDataSpace(vid), ptr, location); } template @@ -757,6 +953,10 @@ class KernelBase Index_type bytes_modify_written_per_rep; Index_type bytes_atomic_modify_written_per_rep; Index_type FLOPs_per_rep; + + bool enable_data_registration = false; + std::unique_ptr countingData; + double kernel_block_size = nan(""); // Set default value for non GPU kernels VariantID running_variant; diff --git a/src/common/RPTypes.hpp b/src/common/RPTypes.hpp index 94a483420..8ec78dd99 100644 --- a/src/common/RPTypes.hpp +++ b/src/common/RPTypes.hpp @@ -143,6 +143,32 @@ using Real_type = float; #endif +template < size_t N > +using Real_array = Real_type[N]; + +template < size_t N0, size_t N1 > +using Real_array2 = Real_type[N0][N1]; + +template < size_t N0, size_t N1, size_t N2 > +using Real_array3 = Real_type[N0][N1][N2]; + +template < size_t N0, size_t N1, size_t N2, size_t N3 > +using Real_array4 = Real_type[N0][N1][N2][N3]; + +template < size_t N > +using Real_array_ref = Real_type(&)[N]; +template < size_t N > +using Real_array_const_ref = const Real_type(&)[N]; + +template < size_t N0, size_t N1 > +using Real_array2_ref = Real_type(&)[N0][N1]; + +template < size_t N0, size_t N1, size_t N2 > +using Real_array3_ref = Real_type(&)[N0][N1][N2]; + +template < size_t N0, size_t N1, size_t N2, size_t N3 > +using Real_array4_ref = Real_type(&)[N0][N1][N2][N3]; + using Real_ptr = Real_type*; using Real_const_ptr = Real_type const *; /// @@ -157,6 +183,12 @@ using Complex_ptr = Complex_type*; #endif + +using Int_type_t = Int_type; +using Index_type_t = Index_type; +using Real_type_t = Real_type; + + #define RAJAPERF_STRINGIFY_HELPER(...) #__VA_ARGS__ #define RAJAPERF_STRINGIFY(...) 
RAJAPERF_STRINGIFY_HELPER(__VA_ARGS__) diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp index 6e7687faa..3f13d2c47 100644 --- a/src/lcals/DIFF_PREDICT.cpp +++ b/src/lcals/DIFF_PREDICT.cpp @@ -77,5 +77,47 @@ void DIFF_PREDICT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_cx, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void DIFF_PREDICT::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + DIFF_PREDICT_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFF_PREDICT_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/DIFF_PREDICT.hpp b/src/lcals/DIFF_PREDICT.hpp index 267d4aedf..2d864c50b 100644 --- a/src/lcals/DIFF_PREDICT.hpp +++ b/src/lcals/DIFF_PREDICT.hpp @@ -89,6 +89,7 @@ class DIFF_PREDICT : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp index 6d4776db3..6a0d01eed 100644 --- a/src/lcals/EOS.cpp +++ b/src/lcals/EOS.cpp @@ -90,5 +90,47 @@ void EOS::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_u, vid); } + +// Only define setCountedAttributes functions past 
this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void EOS::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + EOS_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(EOS_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/EOS.hpp b/src/lcals/EOS.hpp index 7c26049d8..37d539318 100644 --- a/src/lcals/EOS.hpp +++ b/src/lcals/EOS.hpp @@ -58,6 +58,7 @@ class EOS : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp index c15b972c7..80e341e84 100644 --- a/src/lcals/FIRST_DIFF.cpp +++ b/src/lcals/FIRST_DIFF.cpp @@ -81,5 +81,47 @@ void FIRST_DIFF::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void FIRST_DIFF::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const 
Index_type iend = getActualProblemSize(); + + FIRST_DIFF_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(FIRST_DIFF_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_DIFF.hpp b/src/lcals/FIRST_DIFF.hpp index 595fc07cc..368f7d0af 100644 --- a/src/lcals/FIRST_DIFF.hpp +++ b/src/lcals/FIRST_DIFF.hpp @@ -48,6 +48,7 @@ class FIRST_DIFF : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp index 79ec6375b..0caa225b5 100644 --- a/src/lcals/FIRST_MIN.cpp +++ b/src/lcals/FIRST_MIN.cpp @@ -88,5 +88,57 @@ void FIRST_MIN::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_x, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void FIRST_MIN::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + FIRST_MIN_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + FIRST_MIN_MINLOC_INIT; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(FIRST_MIN_BODY); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( 
+ m_minloc = mymin.loc; + ); + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_MIN.hpp b/src/lcals/FIRST_MIN.hpp index e5463edbb..56dea7afe 100644 --- a/src/lcals/FIRST_MIN.hpp +++ b/src/lcals/FIRST_MIN.hpp @@ -75,6 +75,7 @@ class FIRST_MIN : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineOpenMPTargetVariantTunings(); void defineKokkosVariantTunings(); diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp index bb598bdd2..b865323fa 100644 --- a/src/lcals/FIRST_SUM.cpp +++ b/src/lcals/FIRST_SUM.cpp @@ -80,5 +80,47 @@ void FIRST_SUM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void FIRST_SUM::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 1; + const Index_type iend = getActualProblemSize(); + + FIRST_SUM_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(FIRST_SUM_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_SUM.hpp b/src/lcals/FIRST_SUM.hpp index 767244707..3b86e8e22 100644 --- a/src/lcals/FIRST_SUM.hpp +++ b/src/lcals/FIRST_SUM.hpp @@ -51,6 +51,7 
@@ class FIRST_SUM : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/lcals/GEN_LIN_RECUR.cpp b/src/lcals/GEN_LIN_RECUR.cpp index ac8a13d02..59c7ebf8c 100644 --- a/src/lcals/GEN_LIN_RECUR.cpp +++ b/src/lcals/GEN_LIN_RECUR.cpp @@ -90,5 +90,49 @@ void GEN_LIN_RECUR::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_sb, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void GEN_LIN_RECUR::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + GEN_LIN_RECUR_DATA_SETUP; + const Index_type iend = N+1; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type k = 0; k < N; ++k )) { + RAJAPERF_COUNTERS_LOOP_BODY(GEN_LIN_RECUR_OPT_BODY1); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(GEN_LIN_RECUR_OPT_BODY2); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/GEN_LIN_RECUR.hpp b/src/lcals/GEN_LIN_RECUR.hpp index 8a6b0133d..99aad4951 100644 --- a/src/lcals/GEN_LIN_RECUR.hpp +++ b/src/lcals/GEN_LIN_RECUR.hpp @@ -51,6 +51,20 @@ stb5[k] = b5[k+kb5i] - stb5[k]; +#define GEN_LIN_RECUR_OPT_BODY1 \ + Real_type tmp; \ + Real_type stb = stb5[k]; \ + b5[k+kb5i] = tmp = sa[k] + stb*sb[k]; \ + stb5[k] = tmp - stb; + +#define GEN_LIN_RECUR_OPT_BODY2 \ + Index_type k = N - i ; \ + 
Real_type tmp; \ + Real_type stb = stb5[k]; \ + b5[k+kb5i] = tmp = sa[k] + stb*sb[k]; \ + stb5[k] = tmp - stb; + + #include "common/KernelBase.hpp" namespace rajaperf @@ -72,6 +86,7 @@ class GEN_LIN_RECUR : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp index 87da8cba6..e81a9a8cd 100644 --- a/src/lcals/HYDRO_1D.cpp +++ b/src/lcals/HYDRO_1D.cpp @@ -86,5 +86,47 @@ void HYDRO_1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_z, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void HYDRO_1D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + HYDRO_1D_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(HYDRO_1D_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_1D.hpp b/src/lcals/HYDRO_1D.hpp index d092bd58b..2dde13e52 100644 --- a/src/lcals/HYDRO_1D.hpp +++ b/src/lcals/HYDRO_1D.hpp @@ -53,6 +53,7 @@ class HYDRO_1D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void 
setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp index 33922c035..3d009f893 100644 --- a/src/lcals/HYDRO_2D.cpp +++ b/src/lcals/HYDRO_2D.cpp @@ -124,5 +124,63 @@ void HYDRO_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_zz, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void HYDRO_2D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type kbeg = 1; + const Index_type kend = m_kn - 1; + const Index_type jbeg = 1; + const Index_type jend = m_jn - 1; + + HYDRO_2D_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type k = kbeg; k < kend; ++k )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = jbeg; j < jend; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(HYDRO_2D_BODY1); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type k = kbeg; k < kend; ++k )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = jbeg; j < jend; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(HYDRO_2D_BODY2); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type k = kbeg; k < kend; ++k )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = jbeg; j < jend; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(HYDRO_2D_BODY3); + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_2D.hpp b/src/lcals/HYDRO_2D.hpp index af04e4bf4..8b108c715 100644 --- a/src/lcals/HYDRO_2D.hpp +++ b/src/lcals/HYDRO_2D.hpp @@ -147,6 +147,7 @@ class HYDRO_2D : public KernelBase void 
setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp index 9e498a428..beed54f4f 100644 --- a/src/lcals/INT_PREDICT.cpp +++ b/src/lcals/INT_PREDICT.cpp @@ -86,5 +86,47 @@ void INT_PREDICT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_px, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void INT_PREDICT::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INT_PREDICT_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(INT_PREDICT_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/INT_PREDICT.hpp b/src/lcals/INT_PREDICT.hpp index 2316f9197..1ba111124 100644 --- a/src/lcals/INT_PREDICT.hpp +++ b/src/lcals/INT_PREDICT.hpp @@ -68,6 +68,7 @@ class INT_PREDICT : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp index ca90930ba..3c9ebbf31 100644 --- 
a/src/lcals/PLANCKIAN.cpp +++ b/src/lcals/PLANCKIAN.cpp @@ -84,5 +84,47 @@ void PLANCKIAN::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_w, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void PLANCKIAN::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + PLANCKIAN_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(PLANCKIAN_OPT_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/PLANCKIAN.hpp b/src/lcals/PLANCKIAN.hpp index c978fa2ef..3b333c972 100644 --- a/src/lcals/PLANCKIAN.hpp +++ b/src/lcals/PLANCKIAN.hpp @@ -32,6 +32,12 @@ w[i] = x[i] / ( exp( y[i] ) - 1.0 ); +#define PLANCKIAN_OPT_BODY \ + Real_type tmp; \ + y[i] = tmp = u[i] / v[i]; \ + w[i] = x[i] / ( exp( tmp ) - 1.0 ); + + #include "common/KernelBase.hpp" namespace rajaperf @@ -53,6 +59,7 @@ class PLANCKIAN : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp index 07a40b1d6..433876897 100644 --- a/src/lcals/TRIDIAG_ELIM.cpp +++ b/src/lcals/TRIDIAG_ELIM.cpp @@ -84,5 +84,47 @@ void TRIDIAG_ELIM::tearDown(VariantID vid, size_t 
RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_z, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void TRIDIAG_ELIM::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 1; + const Index_type iend = m_N; + + TRIDIAG_ELIM_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(TRIDIAG_ELIM_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/TRIDIAG_ELIM.hpp b/src/lcals/TRIDIAG_ELIM.hpp index 5206378f5..69ab07be9 100644 --- a/src/lcals/TRIDIAG_ELIM.hpp +++ b/src/lcals/TRIDIAG_ELIM.hpp @@ -53,6 +53,7 @@ class TRIDIAG_ELIM : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp index e6883fffd..d631f8130 100644 --- a/src/polybench/POLYBENCH_2MM.cpp +++ b/src/polybench/POLYBENCH_2MM.cpp @@ -110,5 +110,60 @@ void POLYBENCH_2MM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_D, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_2MM::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t 
tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_2MM_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < ni; i++ )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < nj; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_2MM_BODY1); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type k = 0; k < nk; k++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_2MM_BODY2); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_2MM_BODY3); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < ni; i++)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type l = 0; l < nl; l++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_2MM_BODY4); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < nj; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_2MM_BODY5); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_2MM_BODY6); + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_2MM.hpp b/src/polybench/POLYBENCH_2MM.hpp index 4236e6422..626b30e65 100644 --- a/src/polybench/POLYBENCH_2MM.hpp +++ b/src/polybench/POLYBENCH_2MM.hpp @@ -123,6 +123,7 @@ class POLYBENCH_2MM : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp index d68aa4aaf..0231b5d67 100644 --- a/src/polybench/POLYBENCH_3MM.cpp +++ b/src/polybench/POLYBENCH_3MM.cpp @@ -125,5 +125,70 @@ void POLYBENCH_3MM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_G, vid); } + +// Only define setCountedAttributes 
functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_3MM::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_3MM_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < ni; i++ )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < nj; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY1); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type k = 0; k < nk; k++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY2); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY3); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < nj; j++)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type l = 0; l < nl; l++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY4); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type m = 0; m < nm; m++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY5); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY6); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < ni; i++)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type l = 0; l < nl; l++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY7); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < nj; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY8); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY9); + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_3MM.hpp b/src/polybench/POLYBENCH_3MM.hpp index 1672a2769..0ad9b988f 100644 --- a/src/polybench/POLYBENCH_3MM.hpp +++ b/src/polybench/POLYBENCH_3MM.hpp @@ -149,6 +149,7 @@ class POLYBENCH_3MM : 
public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp index 84394c039..a6bf0a8d2 100644 --- a/src/polybench/POLYBENCH_ADI.cpp +++ b/src/polybench/POLYBENCH_ADI.cpp @@ -94,5 +94,62 @@ void POLYBENCH_ADI::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_Q, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_ADI::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_ADI_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < n-1; ++i)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY2); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 1; j < n-1; ++j)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY3); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY4); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type k = n-2; k >= 1; --k)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY5); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < n-1; ++i)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY6); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 1; j < n-1; ++j)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY7); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY8); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type k = n-2; k >= 1; --k)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY9); + } + } + + } + + } + + 
RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_ADI.hpp b/src/polybench/POLYBENCH_ADI.hpp index 60d7aeca3..adbe594c6 100644 --- a/src/polybench/POLYBENCH_ADI.hpp +++ b/src/polybench/POLYBENCH_ADI.hpp @@ -123,6 +123,49 @@ U[i * n + k] = P[i * n + k] * U[i * n + k +1] + Q[i * n + k]; +#define POLYBENCH_ADI_OPT_BODY2 \ + V[0 * n + i] = 1.0; \ + Real_type last_P = 0.0; \ + Real_type last_Q = 1.0; \ + P[i * n + 0] = last_P; \ + Q[i * n + 0] = last_Q; + +#define POLYBENCH_ADI_OPT_BODY3 \ + Real_type tmp_div = a * last_P + b; \ + P[i * n + j] = last_P = -c / tmp_div; \ + Q[i * n + j] = last_Q = (-d * U[j * n + i-1] + (1.0 + 2.0*d) * U[j * n + i] - \ + f * U[j * n + i + 1] - a * last_Q) / \ + tmp_div; + +#define POLYBENCH_ADI_OPT_BODY4 \ + Real_type last_V = 1.0; \ + V[(n-1) * n + i] = last_V; + +#define POLYBENCH_ADI_OPT_BODY5 \ + V[k * n + i] = last_V = P[i * n + k] * last_V + Q[i * n + k]; + +#define POLYBENCH_ADI_OPT_BODY6 \ + U[i * n + 0] = 1.0; \ + Real_type last_P = 0.0; \ + Real_type last_Q = 1.0; \ + P[i * n + 0] = last_P; \ + Q[i * n + 0] = last_Q; + +#define POLYBENCH_ADI_OPT_BODY7 \ + Real_type tmp_div = d * last_P + e; \ + P[i * n + j] = last_P = -f / tmp_div; \ + Q[i * n + j] = last_Q = (-a * V[(i-1) * n + j] + (1.0 + 2.0*a) * V[i * n + j] - \ + c * V[(i + 1) * n + j] - d * last_Q) / \ + tmp_div; + +#define POLYBENCH_ADI_OPT_BODY8 \ + Real_type last_U = 1.0; \ + U[i * n + n-1] = last_U; + +#define POLYBENCH_ADI_OPT_BODY9 \ + U[i * n + k] = last_U = P[i * n + k] * last_U + Q[i * n + k]; + + #define POLYBENCH_ADI_BODY2_RAJA \ Vview(0, i) = 1.0; \ Pview(i, 0) = 0.0; \ @@ -190,6 +233,7 @@ class POLYBENCH_ADI : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void 
defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp index 82b3f65ef..1b2032721 100644 --- a/src/polybench/POLYBENCH_ATAX.cpp +++ b/src/polybench/POLYBENCH_ATAX.cpp @@ -95,5 +95,56 @@ void POLYBENCH_ATAX::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx deallocData(m_A, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_ATAX::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_ATAX_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < N; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ATAX_BODY1); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < N; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ATAX_BODY2); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ATAX_BODY3); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < N; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ATAX_BODY4); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type i = 0; i < N; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ATAX_BODY5); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ATAX_BODY6); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_ATAX.hpp b/src/polybench/POLYBENCH_ATAX.hpp index adbab3def..df094894d 100644 --- a/src/polybench/POLYBENCH_ATAX.hpp +++ b/src/polybench/POLYBENCH_ATAX.hpp @@ -111,6 +111,7 @@ class POLYBENCH_ATAX : public KernelBase void setUp(VariantID vid, size_t tune_idx); void 
updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp index d45e4f2c4..40bddd98e 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D.cpp @@ -119,5 +119,61 @@ void POLYBENCH_FDTD_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ deallocData(m_hz, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_FDTD_2D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_FDTD_2D_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < ny; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_FDTD_2D_BODY1); + } + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < nx; i++)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < ny; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_FDTD_2D_BODY2); + } + } + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < nx; i++)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 1; j < ny; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_FDTD_2D_BODY3); + } + } + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < nx - 1; i++)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < ny - 1; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_FDTD_2D_BODY4); + } + } + + t = (t+1) % m_tsteps; + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git 
a/src/polybench/POLYBENCH_FDTD_2D.hpp b/src/polybench/POLYBENCH_FDTD_2D.hpp index abf6547f1..c39c43f4c 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.hpp +++ b/src/polybench/POLYBENCH_FDTD_2D.hpp @@ -108,6 +108,7 @@ class POLYBENCH_FDTD_2D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp index 99b6cab8b..f733f2159 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp @@ -81,5 +81,48 @@ void POLYBENCH_FLOYD_WARSHALL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_AR deallocData(m_pout, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_FLOYD_WARSHALL::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_FLOYD_WARSHALL_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type k = 0; k < N; ++k)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < N; ++i)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < N; ++j)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_FLOYD_WARSHALL_BODY); + } + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp index 5153d5967..71d7739a9 100644 --- 
a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp @@ -72,6 +72,7 @@ class POLYBENCH_FLOYD_WARSHALL : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp index 9dbfdf51b..446ab5bb8 100644 --- a/src/polybench/POLYBENCH_GEMM.cpp +++ b/src/polybench/POLYBENCH_GEMM.cpp @@ -94,5 +94,51 @@ void POLYBENCH_GEMM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx deallocData(m_C, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_GEMM::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_GEMM_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < ni; ++i )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < nj; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMM_BODY1); + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMM_BODY2); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type k = 0; k < nk; ++k )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMM_BODY3); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMM_BODY4); + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GEMM.hpp b/src/polybench/POLYBENCH_GEMM.hpp index 884316ec7..77be8d74d 100644 --- a/src/polybench/POLYBENCH_GEMM.hpp +++ 
b/src/polybench/POLYBENCH_GEMM.hpp @@ -100,6 +100,7 @@ class POLYBENCH_GEMM : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp index b8d160355..154c87cfc 100644 --- a/src/polybench/POLYBENCH_GEMVER.cpp +++ b/src/polybench/POLYBENCH_GEMVER.cpp @@ -126,5 +126,66 @@ void POLYBENCH_GEMVER::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i deallocData(m_z, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_GEMVER::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_GEMVER_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < n; i++ )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < n; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY1); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < n; i++ )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY2); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < n; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY3); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY4); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < n; i++ )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY5); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < n; i++ )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY6); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < 
n; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY7); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY8); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GEMVER.hpp b/src/polybench/POLYBENCH_GEMVER.hpp index 93712fdb7..c32b83a59 100644 --- a/src/polybench/POLYBENCH_GEMVER.hpp +++ b/src/polybench/POLYBENCH_GEMVER.hpp @@ -151,6 +151,7 @@ class POLYBENCH_GEMVER : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp index c73ae7fea..1cebfec26 100644 --- a/src/polybench/POLYBENCH_GESUMMV.cpp +++ b/src/polybench/POLYBENCH_GESUMMV.cpp @@ -91,5 +91,48 @@ void POLYBENCH_GESUMMV::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ deallocData(m_B, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_GESUMMV::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_GESUMMV_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < N; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GESUMMV_BODY1); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < N; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GESUMMV_BODY2); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GESUMMV_BODY3); + } + + } + + } + + 
RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GESUMMV.hpp b/src/polybench/POLYBENCH_GESUMMV.hpp index 4819c0689..1a67187b4 100644 --- a/src/polybench/POLYBENCH_GESUMMV.hpp +++ b/src/polybench/POLYBENCH_GESUMMV.hpp @@ -94,6 +94,7 @@ class POLYBENCH_GESUMMV : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp index 42e975374..bdd299762 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D.cpp @@ -88,5 +88,58 @@ void POLYBENCH_HEAT_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ deallocData(m_B, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_HEAT_3D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_HEAT_3D_DATA_SETUP; + + const Index_type ijkend = N-1; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < ijkend; ++i )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 1; j < ijkend; ++j )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type k = 1; k < ijkend; ++k )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_HEAT_3D_BODY1); + } + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < ijkend; ++i)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 1; j < ijkend; ++j )) { + 
RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type k = 1; k < ijkend; ++k )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_HEAT_3D_BODY2); + } + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, 0); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_HEAT_3D.hpp b/src/polybench/POLYBENCH_HEAT_3D.hpp index a60c36798..c6c7822ef 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.hpp +++ b/src/polybench/POLYBENCH_HEAT_3D.hpp @@ -108,6 +108,7 @@ class POLYBENCH_HEAT_3D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp index 751c400d3..693effa66 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp @@ -93,5 +93,50 @@ void POLYBENCH_JACOBI_1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun deallocData(m_B, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_JACOBI_1D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_JACOBI_1D_DATA_SETUP; + + const Index_type iend = N-1; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_JACOBI_1D_BODY1); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < iend; ++i)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_JACOBI_1D_BODY2); + } + + } + + } + + 
RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, 0); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_JACOBI_1D.hpp b/src/polybench/POLYBENCH_JACOBI_1D.hpp index 0a21b3b9a..ff39f0478 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.hpp @@ -56,6 +56,7 @@ class POLYBENCH_JACOBI_1D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index 7d4f2072a..642943311 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -88,5 +88,54 @@ void POLYBENCH_JACOBI_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun deallocData(m_B, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_JACOBI_2D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_JACOBI_2D_DATA_SETUP; + + const Index_type ijend = N-1; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < ijend; ++i )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 1; j < ijend; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_JACOBI_2D_BODY1); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < ijend; ++i)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 1; j < ijend; ++j)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_JACOBI_2D_BODY2); + } + } + + } + + } 
+ + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, 0); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_JACOBI_2D.hpp b/src/polybench/POLYBENCH_JACOBI_2D.hpp index f0f43fb57..1f61a6419 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.hpp @@ -75,6 +75,7 @@ class POLYBENCH_JACOBI_2D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp index 37ae5e4dc..9c48586d8 100644 --- a/src/polybench/POLYBENCH_MVT.cpp +++ b/src/polybench/POLYBENCH_MVT.cpp @@ -97,5 +97,56 @@ void POLYBENCH_MVT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_A, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_MVT::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_MVT_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < N; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_MVT_BODY1); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < N; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_MVT_BODY2); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_MVT_BODY3); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < N; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_MVT_BODY4); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < N; ++j )) { + 
RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_MVT_BODY5); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_MVT_BODY6); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_MVT.hpp b/src/polybench/POLYBENCH_MVT.hpp index c9e0b5acd..d4fc87a13 100644 --- a/src/polybench/POLYBENCH_MVT.hpp +++ b/src/polybench/POLYBENCH_MVT.hpp @@ -115,6 +115,7 @@ class POLYBENCH_MVT : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index 4882093b6..923bbb722 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -80,5 +80,47 @@ void ADD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_c, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void ADD::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + ADD_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ADD_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/ADD.hpp b/src/stream/ADD.hpp index 1861a74b5..34a43b685 100644 --- 
a/src/stream/ADD.hpp +++ b/src/stream/ADD.hpp @@ -48,6 +48,7 @@ class ADD : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp index 6d80933eb..e1a262dc4 100644 --- a/src/stream/COPY.cpp +++ b/src/stream/COPY.cpp @@ -78,5 +78,47 @@ void COPY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_c, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void COPY::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + COPY_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(COPY_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/COPY.hpp b/src/stream/COPY.hpp index 8a4f50617..ef5ab589b 100644 --- a/src/stream/COPY.hpp +++ b/src/stream/COPY.hpp @@ -47,6 +47,7 @@ class COPY : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/stream/DOT.cpp b/src/stream/DOT.cpp index 0e203d8e9..1e2ef8fd9 100644 --- 
a/src/stream/DOT.cpp +++ b/src/stream/DOT.cpp @@ -82,5 +82,57 @@ void DOT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_b, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void DOT::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + DOT_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Real_type dot = m_dot_init; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(DOT_BODY); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_dot += dot; + ); + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp index 232d6661b..015c83ec6 100644 --- a/src/stream/DOT.hpp +++ b/src/stream/DOT.hpp @@ -47,6 +47,7 @@ class DOT : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineOpenMPTargetVariantTunings(); void defineKokkosVariantTunings(); diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp index 65e4def13..edc7d88db 100644 --- a/src/stream/MUL.cpp +++ b/src/stream/MUL.cpp @@ -79,5 +79,47 @@ void MUL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_c, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) 
become wrappers past this point +#include "common/CountingMacros.hpp" + +void MUL::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MUL_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(MUL_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/MUL.hpp b/src/stream/MUL.hpp index d39bc1cba..3909773e7 100644 --- a/src/stream/MUL.hpp +++ b/src/stream/MUL.hpp @@ -48,6 +48,7 @@ class MUL : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings(); diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp index 6af06619c..190beb8bb 100644 --- a/src/stream/TRIAD.cpp +++ b/src/stream/TRIAD.cpp @@ -81,5 +81,47 @@ void TRIAD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_c, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void TRIAD::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + TRIAD_DATA_SETUP + ); + + 
RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(TRIAD_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/TRIAD.hpp b/src/stream/TRIAD.hpp index a26bab846..3fb49e951 100644 --- a/src/stream/TRIAD.hpp +++ b/src/stream/TRIAD.hpp @@ -49,6 +49,7 @@ class TRIAD : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void defineSeqVariantTunings(); void defineOpenMPVariantTunings();