Index: openmp/trunk/README.rst =================================================================== --- openmp/trunk/README.rst +++ openmp/trunk/README.rst @@ -166,7 +166,7 @@ Create the Fortran modules (requires Fortran compiler). macOS* Fat Libraries -"""""""""""""""""" +"""""""""""""""""""" On macOS* machines, it is possible to build universal (or fat) libraries which include both i386 and x86_64 architecture objects in a single archive. @@ -254,6 +254,40 @@ Path of the folder that contains ``libomp.so``. This is required for testing out-of-tree builds. +Options for ``NVPTX device RTL`` +-------------------------------- + +**LIBOMPTARGET_NVPTX_ENABLE_BCLIB** = ``OFF|ON`` + Enable the CUDA LLVM bitcode offloading device RTL. This is used for link-time + optimization of the OpenMP runtime and application code. + +**LIBOMPTARGET_NVPTX_CUDA_COMPILER** = ``""`` + Location of a CUDA compiler capable of emitting LLVM bitcode. Currently only + the Clang compiler is supported. This is only used when building the CUDA LLVM + bitcode offloading device RTL. If unspecified and the CMake C compiler is + Clang, then Clang is used. + +**LIBOMPTARGET_NVPTX_BC_LINKER** = ``""`` + Location of a linker capable of linking LLVM bitcode objects. This is only + used when building the CUDA LLVM bitcode offloading device RTL. If unspecified + and the CMake C compiler is Clang and an llvm-link binary exists in the + directory containing Clang, then this llvm-link binary is used. + +**LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER** = ``""`` + Host compiler to use with NVCC. This compiler is not used to + produce any binary. Instead, it is used to overcome the input compiler + checks done by NVCC. E.g., if the default host compiler is not + compatible with NVCC, this option can be used to pass NVCC a valid compiler + and avoid the error. + +**LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY** = ``35`` + CUDA compute capability that should be supported by the NVPTX device RTL. E.g., + for compute capability 6.0, the option "60" should be used. Compute capability + 3.5 is the minimum required. + +**LIBOMPTARGET_NVPTX_DEBUG** = ``OFF|ON`` + Enable printing of debug messages from the NVPTX device RTL. + Example Usages of CMake ======================= Index: openmp/trunk/libomptarget/CMakeLists.txt =================================================================== --- openmp/trunk/libomptarget/CMakeLists.txt +++ openmp/trunk/libomptarget/CMakeLists.txt @@ -67,6 +67,7 @@ # Build offloading plugins and device RTLs if they are available. add_subdirectory(plugins) +add_subdirectory(deviceRTLs) # Add tests. add_subdirectory(test) Index: openmp/trunk/libomptarget/deviceRTLs/CMakeLists.txt =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/CMakeLists.txt +++ openmp/trunk/libomptarget/deviceRTLs/CMakeLists.txt @@ -0,0 +1,14 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build a device RTL for each available machine.
+# +##===----------------------------------------------------------------------===## + +add_subdirectory(nvptx) Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/CMakeLists.txt =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -0,0 +1,200 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build the NVPTX (CUDA) Device RTL if the CUDA tools are available +# +##===----------------------------------------------------------------------===## + +set(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER "" CACHE STRING + "Path to alternate NVCC host compiler to be used by the NVPTX device RTL.") + +if(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER) + find_program(ALTERNATE_CUDA_HOST_COMPILER NAMES ${LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER}) + if(NOT ALTERNATE_CUDA_HOST_COMPILER) + libomptarget_say("Not building CUDA offloading device RTL: invalid NVPTX alternate host compiler.") + endif() + set(CUDA_HOST_COMPILER ${ALTERNATE_CUDA_HOST_COMPILER} CACHE FILEPATH "" FORCE) +endif() + +# We can't use clang as nvcc host preprocessor, so we attempt to replace it with +# gcc. +if(CUDA_HOST_COMPILER MATCHES clang) + + find_program(LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER NAMES gcc) + + if(NOT LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER) + libomptarget_say("Not building CUDA offloading device RTL: clang is not supported as NVCC host compiler.") + libomptarget_say("Please include gcc in your path or set LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER to the full path of of valid compiler.") + return() + endif() + set(CUDA_HOST_COMPILER "${LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER}" CACHE FILEPATH "" FORCE) +endif() + +if(LIBOMPTARGET_DEP_CUDA_FOUND) + libomptarget_say("Building CUDA offloading device RTL.") + + # We really don't have any host code, so we don't need to care about + # propagating host flags. + set(CUDA_PROPAGATE_HOST_FLAGS OFF) + + set(cuda_src_files + src/cancel.cu + src/critical.cu + src/data_sharing.cu + src/libcall.cu + src/loop.cu + src/omptarget-nvptx.cu + src/parallel.cu + src/reduction.cu + src/sync.cu + src/task.cu + ) + + set(omp_data_objects src/omp_data.cu) + + # Get the compute capability the user requested or use SM_35 by default. + # SM_35 is what clang uses by default. + set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY 35 CACHE STRING + "CUDA Compute Capability to be used to compile the NVPTX device RTL.") + set(CUDA_ARCH -arch sm_${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY}) + + # Activate RTL message dumps if requested by the user. + set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL + "Activate NVPTX device RTL debug messages.") + if(${LIBOMPTARGET_NVPTX_DEBUG}) + set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1 -g --ptxas-options=-v) + endif() + + # NVPTX runtime library has to be statically linked. Dynamic linking is not + # yet supported by the CUDA toolchain on the device. + set(BUILD_SHARED_LIBS OFF) + set(CUDA_SEPARABLE_COMPILATION ON) + + cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects} + OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG}) + + # Install device RTL under the lib destination folder. 
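+ # For example, with the default CMAKE_INSTALL_PREFIX this typically places the static + # archive (libomptarget-nvptx.a on Linux) under <prefix>/lib.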
+ install(TARGETS omptarget-nvptx ARCHIVE DESTINATION "lib") + + target_link_libraries(omptarget-nvptx ${CUDA_LIBRARIES}) + + # Check if we can create an LLVM bitcode implementation of the runtime library + # that could be inlined in the user implementation. + set(LIBOMPTARGET_NVPTX_ENABLE_BCLIB FALSE CACHE BOOL + "Enable CUDA LLVM bitcode offloading device RTL.") + if (${LIBOMPTARGET_NVPTX_ENABLE_BCLIB}) + + # Find a clang compiler capable of compiling cuda files to LLVM bitcode and + # an LLVM linker. + # We use the one provided by the user, attempt to use the one used to build + # libomptarget or just fail. + + set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING + "Location of a CUDA compiler capable of emitting LLVM bitcode.") + set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING + "Location of a linker capable of linking LLVM bitcode objects.") + + if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "") + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER}) + elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER}) + else() + libomptarget_error_say("Cannot find a CUDA compiler capable of emitting LLVM bitcode.") + libomptarget_error_say("Please configure with flag -DLIBOMPTARGET_NVPTX_CUDA_COMPILER") + endif() + + # Get compiler directory to try to locate a suitable linker + get_filename_component(COMPILER_DIR ${CMAKE_C_COMPILER} DIRECTORY) + + if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "") + set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${LIBOMPTARGET_NVPTX_BC_LINKER}) + elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND EXISTS "${COMPILER_DIR}/llvm-link") + # Use llvm-link from the directory containing clang + set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${COMPILER_DIR}/llvm-link) + else() + libomptarget_error_say("Cannot find a linker capable of linking LLVM bitcode objects.") + libomptarget_error_say("Please configure with flag -DLIBOMPTARGET_NVPTX_BC_LINKER") + endif() + + if(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER AND LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER) + libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.") + + # Decide which ptx version to use. Same choices as Clang. + if(CUDA_VERSION_MAJOR GREATER 9 OR CUDA_VERSION_MAJOR EQUAL 9) + set(CUDA_PTX_VERSION ptx60) + else() + set(CUDA_PTX_VERSION ptx42) + endif() + + # Set flags for Clang cuda compilation. Only Clang is supported because there is + # no other compiler capable of generating bitcode from cuda sources. + set(CUDA_FLAGS + -emit-llvm + -O1 + -Xclang -target-feature + -Xclang +${CUDA_PTX_VERSION} + --cuda-device-only + -DOMPTARGET_NVPTX_TEST=0 -DOMPTARGET_NVPTX_DEBUG=0 + ) + + # CUDA 9 header files use the nv_weak attribute which clang is not yet prepared + # to handle. Therefore, we use 'weak' instead. We are compiling only for the + # device, so it should be equivalent. + if(CUDA_VERSION_MAJOR EQUAL 9) + set(CUDA_FLAGS ${CUDA_FLAGS} -Dnv_weak=weak) + endif() + + # Get the compute capability the user requested or use SM_35 by default. + set(CUDA_ARCH "") + set(CUDA_ARCH --cuda-gpu-arch=sm_${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY}) + + # Compile cuda files to bitcode. 
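+ # For illustration, with the defaults above (Clang as the selected CUDA compiler, + # sm_35 and a pre-9 CUDA toolkit) the command generated for each file below is roughly: + #   clang -emit-llvm -O1 -Xclang -target-feature -Xclang +ptx42 --cuda-device-only + #     -DOMPTARGET_NVPTX_TEST=0 -DOMPTARGET_NVPTX_DEBUG=0 --cuda-gpu-arch=sm_35 -c src/loop.cu -o loop.cu.bc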
+ set(bc_files "") + foreach(src ${cuda_src_files}) + get_filename_component(infile ${src} ABSOLUTE) + get_filename_component(outfile ${src} NAME) + + add_custom_command(OUTPUT ${outfile}.bc + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${CUDA_FLAGS} ${CUDA_ARCH} ${CUDA_INCLUDES} + -c ${infile} -o ${outfile}.bc + DEPENDS ${infile} + IMPLICIT_DEPENDS CXX ${infile} + COMMENT "Building LLVM bitcode ${outfile}.bc" + VERBATIM + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}.bc) + + list(APPEND bc_files ${outfile}.bc) + endforeach() + + # Link to a bitcode library. + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} + -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc ${bc_files} + DEPENDS ${bc_files} + COMMENT "Linking LLVM bitcode libomptarget-nvptx.bc" + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx.bc) + + add_custom_target(omptarget-nvptx-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc) + + # Copy library to destination. + add_custom_command(TARGET omptarget-nvptx-bc POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc + $) + + # Install device RTL under the lib destination folder. + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc DESTINATION "lib") + + endif() + endif() + +else() + libomptarget_say("Not building CUDA offloading device RTL: CUDA tools not found in the system.") +endif() Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt @@ -0,0 +1,523 @@ + +**Design document for OpenMP reductions on the GPU** + +//Abstract: //In this document we summarize the new design for an OpenMP +implementation of reductions on NVIDIA GPUs. This document comprises +* a succinct background review, +* an introduction to the decoupling of reduction algorithm and + data-structure-specific processing routines, +* detailed illustrations of reduction algorithms used and +* a brief overview of steps we have made beyond the last implementation. + +**Problem Review** + +Consider a typical OpenMP program with reduction pragma. + +``` + double foo, bar; + #pragma omp parallel for reduction(+:foo, bar) + for (int i = 0; i < N; i++) { + foo+=A[i]; bar+=B[i]; + } +``` +where 'foo' and 'bar' are reduced across all threads in the parallel region. +Our primary goal is to efficiently aggregate the values of foo and bar in +such manner that +* makes the compiler logically concise. +* efficiently reduces within warps, threads, blocks and the device. + +**Introduction to Decoupling** +In this section we address the problem of making the compiler +//logically concise// by partitioning the task of reduction into two broad +categories: data-structure specific routines and algorithmic routines. + +The previous reduction implementation was highly coupled with +the specificity of the reduction element data structures (e.g., sizes, data +types) and operators of the reduction (e.g., addition, multiplication). In +our implementation we strive to decouple them. In our final implementations, +we could remove all template functions in our runtime system. + +The (simplified) pseudo code generated by LLVM is as follows: + +``` + 1. 
Create private copies of variables: foo_p, bar_p + 2. Each thread reduces the chunk of A and B assigned to it and writes + to foo_p and bar_p respectively. + 3. ret = kmpc_nvptx_reduce_nowait(..., reduceData, shuffleReduceFn, + interWarpCpyFn) + where: + struct ReduceData { + double *foo; + double *bar; + } reduceData + reduceData.foo = &foo_p + reduceData.bar = &bar_p + + shuffleReduceFn and interWarpCpyFn are two auxiliary functions + generated to aid the runtime performing algorithmic steps + while being data-structure agnostic about ReduceData. + + In particular, shuffleReduceFn is a function that takes the following + inputs: + a. local copy of ReduceData + b. its lane_id + c. the offset of the lane_id which hosts a remote ReduceData + relative to the current one + d. an algorithm version paramter determining which reduction + algorithm to use. + This shuffleReduceFn retrieves the remote ReduceData through shuffle + intrinsics and reduces, using the algorithm specified by the 4th + parameter, the local ReduceData and with the remote ReduceData element + wise, and places the resultant values into the local ReduceData. + + Different reduction algorithms are implemented with different runtime + functions, but they all make calls to this same shuffleReduceFn to + perform the essential reduction step. Therefore, based on the 4th + parameter, this shuffleReduceFn will behave slightly differently to + cooperate with the runtime function to ensure correctness under + different circumstances. + + InterWarpCpyFn, as the name suggests, is a function that copies data + across warps. Its function is to tunnel all the thread private + ReduceData that is already reduced within a warp to a lane in the first + warp with minimal shared memory footprint. This is an essential step to + prepare for the last step of a block reduction. + + (Warp, block, device level reduction routines that utilize these + auxiliary functions will be discussed in the next section.) + + 4. if ret == 1: + The master thread stores the reduced result in the globals. + foo += reduceData.foo; bar += reduceData.bar +``` + +**Reduction Algorithms** + +On the warp level, we have three versions of the algorithms: + +1. Full Warp Reduction + +``` +gpu_regular_warp_reduce(void *reduce_data, + kmp_ShuffleReductFctPtr ShuffleReduceFn) { + for (int offset = WARPSIZE/2; offset > 0; offset /= 2) + ShuffleReduceFn(reduce_data, 0, offset, 0); +} +``` +ShuffleReduceFn is used here with lane_id set to 0 because it is not used +therefore we save instructions by not retrieving lane_id from the corresponding +special registers. The 4th parameters, which represents the version of the +algorithm being used here, is set to 0 to signify full warp reduction. + +In this version specified (=0), the ShuffleReduceFn behaves, per element, as +follows: + +``` +//reduce_elem refers to an element in the local ReduceData +//remote_elem is retrieved from a remote lane +remote_elem = shuffle_down(reduce_elem, offset, 32); +reduce_elem = reduce_elem @ remote_elem; + +``` + +An illustration of this algorithm operating on a hypothetical 8-lane full-warp +would be: +{F74} +The coloring invariant follows that elements with the same color will be +combined and reduced in the next reduction step. As can be observed, no overhead +is present, exactly log(2, N) steps are needed. + +2. 
Contiguous Partial Warp Reduction +``` +gpu_irregular_warp_reduce(void *reduce_data, + kmp_ShuffleReductFctPtr ShuffleReduceFn, int size, + int lane_id) { + int curr_size; + int offset; + curr_size = size; + offset = curr_size/2; + while (offset>0) { + ShuffleReduceFn(reduce_data, lane_id, offset, 1); + curr_size = (curr_size+1)/2; + offset = curr_size/2; + } +} +``` + +In this version specified (=1), the ShuffleReduceFn behaves, per element, as +follows: +``` +//reduce_elem refers to an element in the local ReduceData +//remote_elem is retrieved from a remote lane +remote_elem = shuffle_down(reduce_elem, offset, 32); +if (lane_id < offset) { + reduce_elem = reduce_elem @ remote_elem +} else { + reduce_elem = remote_elem +} +``` + +An important invariant (also a restriction on the starting state of the +reduction) is that this algorithm assumes that all unused ReduceData are +located in a contiguous subset of threads in a warp starting from lane 0. + +With the presence of a trailing active lane with an odd-numbered lane +id, its value will not be aggregated with any other lane. Therefore, +in order to preserve the invariant, such ReduceData is copied to the first lane +whose thread-local ReduceData has already been used in a previous reduction +and would therefore be useless otherwise. + +An illustration of this algorithm operating on a hypothetical 8-lane partial +warp would be: +{F75} + +As illustrated, this version of the algorithm introduces overhead whenever +we have an odd number of participating lanes in any reduction step, to +copy data between lanes. + +3. Dispersed Partial Warp Reduction +``` +gpu_irregular_simt_reduce(void *reduce_data, + kmp_ShuffleReductFctPtr ShuffleReduceFn) { + int size, remote_id; + int logical_lane_id = find_number_of_dispersed_active_lanes_before_me() * 2; + do { + remote_id = find_the_next_active_lane_id_right_after_me(); + // the above function returns 0 if no active lane + // is present right after the current thread. + size = get_number_of_active_lanes_in_this_warp(); + logical_lane_id /= 2; + ShuffleReduceFn(reduce_data, logical_lane_id, remote_id-1-threadIdx.x, 2); + } while (logical_lane_id % 2 == 0 && size > 1); +``` + +There is no assumption made about the initial state of the reduction. +Any number of lanes (>=1) could be active at any position. The reduction +result is kept in the first active lane. + +In this version specified (=2), the ShuffleReduceFn behaves, per element, as +follows: +``` +//reduce_elem refers to an element in the local ReduceData +//remote_elem is retrieved from a remote lane +remote_elem = shuffle_down(reduce_elem, offset, 32); +if (LaneId % 2 == 0 && Offset > 0) { + reduce_elem = reduce_elem @ remote_elem +} else { + reduce_elem = remote_elem +} +``` +We will proceed with a brief explanation of some of the arguments passed in. +Note that, in this section, we introduce the +concept of logical_lane_id, which is important to distinguish +from the physical lane_id as defined by nvidia. +1. //logical_lane_id//: as the name suggests, it refers to the calculated + lane_id (instead of the physical one defined by nvidia) that would make + our algorithm logically concise. A thread with logical_lane_id k means + there are (k-1) threads before it. +2. //remote_id-1-threadIdx.x//: remote_id is indeed the nvidia-defined lane + id of the remote lane from which we will retrieve the ReduceData.
We + subtract (threadIdx+1) from it because we would like to maintain only one + underlying shuffle intrinsic (which is used to communicate among lanes in a + warp). This particular version of shuffle intrinsic we take accepts only + offsets, instead of absolute lane_id. Therefore the subtraction is performed + on the absolute lane_id we calculated to obtain the offset. + +This algorithm is slightly different in 2 ways and it is not, conceptually, a +generalization of the above algorithms. +1. It reduces elements close to each other. For instance, values in the 0th lane + is to be combined with that of the 1st lane; values in the 2nd lane is to be + combined with that of the 3rd lane. We did not use the previous algorithm + where the first half of the (partial) warp is reduced with the second half + of the (partial) warp. This is because, the mapping + f(x): logical_lane_id -> physical_lane_id; + can be easily calculated whereas its inverse + f^-1(x): physical_lane_id -> logical_lane_id + cannot and performing such reduction requires the inverse to be known. +2. Because this algorithm is agnostic about the positions of the lanes that are + active, we do not need to perform the coping step as in the second + algorithm. +An illustrative run would look like +{F76} +As observed, overhead is high because in each and every step of reduction, +logical_lane_id is recalculated; so is the remote_id. + +On a block level, we have implemented the following block reduce algorithm: + +``` +gpu_irregular_block_reduce(void *reduce_data, + kmp_ShuffleReductFctPtr shuflReduceFn, + kmp_InterWarpCopyFctPtr interWarpCpyFn, + int size) { + + int wid = threadIdx.x/WARPSIZE; + int lane_id = threadIdx.x%WARPSIZE; + + int warp_needed = (size+WARPSIZE-1)/WARPSIZE; //ceiling of division + + unsigned tnum = __ballot(1); + int thread_num = __popc(tnum); + + //full warp reduction + if (thread_num == WARPSIZE) { + gpu_regular_warp_reduce(reduce_data, shuflReduceFn); + } + //partial warp reduction + if (thread_num < WARPSIZE) { + gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, thread_num, + lane_id); + } + //Gather all the reduced values from each warp + //to the first warp + //named_barrier inside this function to ensure + //correctness. It is effectively a sync_thread + //that won't deadlock. + interWarpCpyFn(reduce_data, warp_needed); + + //This is to reduce data gathered from each "warp master". + if (wid==0) { + gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, warp_needed, + lane_id); + } + + return; +} +``` +In this function, no ShuffleReduceFn is directly called as it makes calls +to various versions of the warp-reduction functions. It first reduces +ReduceData warp by warp; in the end, we end up with the number of +ReduceData equal to the number of warps present in this thread +block. We then proceed to gather all such ReduceData to the first warp. + +As observed, in this algorithm we make use of the function InterWarpCpyFn, +which copies data from each of the "warp master" (0th lane of each warp, where +a warp-reduced ReduceData is held) to the 0th warp. This step reduces (in a +mathematical sense) the problem of reduction across warp masters in a block to +the problem of warp reduction which we already have solutions to. + +We can thus completely avoid the use of atomics to reduce in a threadblock. + +**Efficient Cross Block Reduce** + +The next challenge is to reduce values across threadblocks. We aim to do this +without atomics or critical sections. + +Let a kernel be started with TB threadblocks. 
+Let the GPU have S SMs. +There can be at most N active threadblocks per SM at any time. + +Consider a threadblock tb (tb < TB) running on SM s (s < SM). 'tb' is one of +at most 'N' active threadblocks on SM s. Let each threadblock active on an SM +be given an instance identifier id (0 <= id < N). Therefore, the tuple (s, id) +uniquely identifies an active threadblock on the GPU. + +To efficiently implement cross block reduce, we first allocate an array for +each value to be reduced of size S*N (which is the maximum number of active +threadblocks at any time on the device). + +Each threadblock reduces its value to slot [s][id]. This can be done without +locking since no other threadblock can write to the same slot concurrently. + +As a final stage, we reduce the values in the array as follows: + +``` +// Compiler generated wrapper function for each target region with a reduction +clause. +target_function_wrapper(map_args, reduction_array) <--- start with 1 team and 1 + thread. + // Use dynamic parallelism to launch M teams, N threads as requested by the + user to execute the target region. + + target_function<>(map_args) + + Reduce values in reduction_array + +``` + +**Comparison with Last Version** + + +The (simplified) pseudo code generated by LLVM on the host is as follows: + + +``` + 1. Create private copies of variables: foo_p, bar_p + 2. Each thread reduces the chunk of A and B assigned to it and writes + to foo_p and bar_p respectively. + 3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock) + where: + struct ReduceData { + double *foo; + double *bar; + } reduceData + reduceData.foo = &foo_p + reduceData.bar = &bar_p + + reduceFn is a pointer to a function that takes in two inputs + of type ReduceData, "reduces" them element wise, and places the + result in the first input: + reduceFn(ReduceData *a, ReduceData *b) + a = a @ b + + Every thread in the parallel region calls kmpc_reduce_nowait with + its private copy of reduceData. The runtime reduces across the + threads (using tree reduction on the operator 'reduceFn?) and stores + the final result in the master thread if successful. + 4. if ret == 1: + The master thread stores the reduced result in the globals. + foo += reduceData.foo; bar += reduceData.bar + 5. else if ret == 2: + In this case kmpc_reduce_nowait() could not use tree reduction, + so use atomics instead: + each thread atomically writes to foo + each thread atomically writes to bar +``` + +On a GPU, a similar reduction may need to be performed across SIMT threads, +warps, and threadblocks. The challenge is to do so efficiently in a fashion +that is compatible with the LLVM OpenMP implementation. + +In the previously released 0.1 version of the LLVM OpenMP compiler for GPUs, +the salient steps of the code generated are as follows: + + +``` + 1. Create private copies of variables: foo_p, bar_p + 2. Each thread reduces the chunk of A and B assigned to it and writes + to foo_p and bar_p respectively. + 3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock) + status = can_block_reduce() + if status == 1: + reduce efficiently to thread 0 using shuffles and shared memory. + return 1 + else + cannot use efficient block reduction, fallback to atomics + return 2 + 4. if ret == 1: + The master thread stores the reduced result in the globals. + foo += reduceData.foo; bar += reduceData.bar + 5. 
else if ret == 2: + In this case kmpc_reduce_nowait() could not use tree reduction, + so use atomics instead: + each thread atomically writes to foo + each thread atomically writes to bar +``` + +The function can_block_reduce() is defined as follows: + + +``` +int32_t can_block_reduce() { + int tid = GetThreadIdInTeam(); + int nt = GetNumberOfOmpThreads(tid); + if (nt != blockDim.x) + return 0; + unsigned tnum = __ballot(1); + if (tnum != (~0x0)) { + return 0; + } + return 1; +} +``` + +This function permits the use of the efficient block reduction algorithm +using shuffles and shared memory (return 1) only if (a) all SIMT threads in +a warp are active (i.e., number of threads in the parallel region is a +multiple of 32) and (b) the number of threads in the parallel region +(set by the num_threads clause) equals blockDim.x. + +If either of these preconditions is not true, each thread in the threadblock +updates the global value using atomics. + +Atomics and compare-and-swap operations are expensive on many threaded +architectures such as GPUs and we must avoid them completely. + + +**Appendix: Implementation Details** + + +``` +// Compiler generated function. +reduceFn(ReduceData *a, ReduceData *b) + a->foo = a->foo + b->foo + a->bar = a->bar + b->bar + +// Compiler generated function. +swapAndReduceFn(ReduceData *thread_private, int lane) + ReduceData *remote = new ReduceData() + remote->foo = shuffle_double(thread_private->foo, lane) + remote->bar = shuffle_double(thread_private->bar, lane) + reduceFn(thread_private, remote) + +// OMP runtime function. +warpReduce_regular(ReduceData *thread_private, Fn *swapAndReduceFn): + offset = 16 + while (offset > 0) + swapAndReduceFn(thread_private, offset) + offset /= 2 + +// OMP runtime function. +warpReduce_irregular(): + ... + +// OMP runtime function. +kmpc_reduce_warp(reduceData, swapAndReduceFn) + if all_lanes_active: + warpReduce_regular(reduceData, swapAndReduceFn) + else: + warpReduce_irregular(reduceData, swapAndReduceFn) + if in_simd_region: + // all done, reduce to global in simd lane 0 + return 1 + else if in_parallel_region: + // done reducing to one value per warp, now reduce across warps + return 3 + +// OMP runtime function; one for each basic type. +kmpc_reduce_block_double(double *a) + if lane == 0: + shared[wid] = *a + named_barrier(1, num_threads) + if wid == 0 + block_reduce(shared) + if lane == 0 + *a = shared[0] + named_barrier(1, num_threads) + if wid == 0 and lane == 0 + return 1 // write back reduced result + else + return 0 // don't do anything + +``` + + + +``` +// Compiler generated code. + 1. Create private copies of variables: foo_p, bar_p + 2. Each thread reduces the chunk of A and B assigned to it and writes + to foo_p and bar_p respectively. + 3. ret = kmpc_reduce_warp(reduceData, swapAndReduceFn) + 4. if ret == 1: + The master thread stores the reduced result in the globals. + foo += reduceData.foo; bar += reduceData.bar + 5. else if ret == 3: + ret = block_reduce_double(reduceData.foo) + if ret == 1: + foo += reduceData.foo + ret = block_reduce_double(reduceData.bar) + if ret == 1: + bar += reduceData.bar +``` + +**Notes** + + 1. This scheme requires that the CUDA OMP runtime can call llvm generated + functions. This functionality now works. + 2. If the user inlines the CUDA OMP runtime bitcode, all of the machinery + (including calls through function pointers) are optimized away. + 3. 
If we are reducing multiple to multiple variables in a parallel region, + the reduce operations are all performed in warpReduce_[ir]regular(). This + results in more instructions in the loop and should result in fewer + stalls due to data dependencies. Unfortunately we cannot do the same in + kmpc_reduce_block_double() without increasing shared memory usage. Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/cancel.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/cancel.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/cancel.cu @@ -0,0 +1,28 @@ +//===------ cancel.cu - NVPTX OpenMP cancel interface ------------ CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Interface to be used in the implementation of OpenMP cancel. +// +//===----------------------------------------------------------------------===// + +#include "omptarget-nvptx.h" + +EXTERN int32_t __kmpc_cancellationpoint(kmp_Indent *loc, int32_t global_tid, + int32_t cancelVal) { + PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", cancelVal); + // disabled + return FALSE; +} + +EXTERN int32_t __kmpc_cancel(kmp_Indent *loc, int32_t global_tid, + int32_t cancelVal) { + PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", cancelVal); + // disabled + return FALSE; +} Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/counter_group.h =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/counter_group.h +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/counter_group.h @@ -0,0 +1,51 @@ +//===------ counter_group.h - NVPTX OpenMP loop scheduling ------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. 
+// +//===----------------------------------------------------------------------===// +// +// Interface to implement OpenMP loop scheduling +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_NVPTX_COUNTER_GROUP_H_ +#define _OMPTARGET_NVPTX_COUNTER_GROUP_H_ + +#include "option.h" + +// counter group type for synchronizations +class omptarget_nvptx_CounterGroup { +public: + // getters and setters + INLINE Counter &Event() { return v_event; } + INLINE volatile Counter &Start() { return v_start; } + INLINE Counter &Init() { return v_init; } + + // Synchronization Interface + + INLINE void Clear(); // first time start=event + INLINE void Reset(); // init = first + INLINE void Init(Counter &priv); // priv = init + INLINE Counter Next(); // just counts number of events + + // set priv to n, to be used in later waitOrRelease + INLINE void Complete(Counter &priv, Counter n); + + // check priv and decide if we have to wait or can free the other warps + INLINE void Release(Counter priv, Counter current_event_value); + INLINE void WaitOrRelease(Counter priv, Counter current_event_value); + +private: + Counter v_event; // counter of events (atomic) + + // volatile is needed to force loads to read from global + // memory or L2 cache and see the write by the last master + volatile Counter v_start; // signal when events registered are finished + + Counter v_init; // used to initialize local thread variables +}; + +#endif /* SRC_COUNTER_GROUP_H_ */ Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/counter_groupi.h =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/counter_groupi.h +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/counter_groupi.h @@ -0,0 +1,82 @@ +//===----- counter_groupi.h - NVPTX OpenMP loop scheduling ------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. 
+// +//===----------------------------------------------------------------------===// +// +// Interface implementation for OpenMP loop scheduling +// +//===----------------------------------------------------------------------===// + +#include "option.h" + +INLINE void omptarget_nvptx_CounterGroup::Clear() { + PRINT0(LD_SYNCD, "clear counters\n") + v_event = 0; + v_start = 0; + // v_init does not need to be reset (its value is dead) +} + +INLINE void omptarget_nvptx_CounterGroup::Reset() { + // done by master before entering parallel + ASSERT(LT_FUSSY, v_event == v_start, + "error, entry %lld !=start %lld at reset\n", P64(v_event), + P64(v_start)); + v_init = v_start; +} + +INLINE void omptarget_nvptx_CounterGroup::Init(Counter &priv) { + PRINT(LD_SYNCD, "init priv counter 0x%llx with val %lld\n", P64(&priv), + P64(v_start)); + priv = v_start; +} + +// just counts number of events +INLINE Counter omptarget_nvptx_CounterGroup::Next() { + Counter oldVal = atomicAdd(&v_event, (Counter)1); + PRINT(LD_SYNCD, "next event counter 0x%llx with val %lld->%lld\n", + P64(&v_event), P64(oldVal), P64(oldVal + 1)); + + return oldVal; +} + +// set priv to n, to be used in later waitOrRelease +INLINE void omptarget_nvptx_CounterGroup::Complete(Counter &priv, Counter n) { + PRINT(LD_SYNCD, "complete priv counter 0x%llx with val %lld->%lld (+%d)\n", + P64(&priv), P64(priv), P64(priv + n), n); + priv += n; +} + +INLINE void omptarget_nvptx_CounterGroup::Release(Counter priv, + Counter current_event_value) { + if (priv - 1 == current_event_value) { + PRINT(LD_SYNCD, "Release start counter 0x%llx with val %lld->%lld\n", + P64(&v_start), P64(v_start), P64(priv)); + v_start = priv; + } +} + +// check priv and decide if we have to wait or can free the other warps +INLINE void +omptarget_nvptx_CounterGroup::WaitOrRelease(Counter priv, + Counter current_event_value) { + if (priv - 1 == current_event_value) { + PRINT(LD_SYNCD, "Release start counter 0x%llx with val %lld->%lld\n", + P64(&v_start), P64(v_start), P64(priv)); + v_start = priv; + } else { + PRINT(LD_SYNCD, + "Start waiting while start counter 0x%llx with val %lld < %lld\n", + P64(&v_start), P64(v_start), P64(priv)); + while (priv > v_start) { + // IDLE LOOP + // start is volatile: it will be re-loaded at each while loop + } + PRINT(LD_SYNCD, + "Done waiting as start counter 0x%llx with val %lld >= %lld\n", + P64(&v_start), P64(v_start), P64(priv)); + } +} Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/critical.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/critical.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/critical.cu @@ -0,0 +1,32 @@ +//===------ critical.cu - NVPTX OpenMP critical ------------------ CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of critical with KMPC interface +// +//===----------------------------------------------------------------------===// + +#include + +#include "omptarget-nvptx.h" + +EXTERN +void __kmpc_critical(kmp_Indent *loc, int32_t global_tid, + kmp_CriticalName *lck) { + PRINT0(LD_IO, "call to kmpc_critical()\n"); + omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); + omp_set_lock(teamDescr.CriticalLock()); +} + +EXTERN +void __kmpc_end_critical(kmp_Indent *loc, int32_t global_tid, + kmp_CriticalName *lck) { + PRINT0(LD_IO, "call to kmpc_end_critical()\n"); + omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); + omp_unset_lock(teamDescr.CriticalLock()); +} Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -0,0 +1,324 @@ +//===----- data_sharing.cu - NVPTX OpenMP debug utilities -------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of data sharing environments/ +// +//===----------------------------------------------------------------------===// +#include "omptarget-nvptx.h" +#include + +// Number of threads in the CUDA block. +__device__ static unsigned getNumThreads() { return blockDim.x; } +// Thread ID in the CUDA block +__device__ static unsigned getThreadId() { return threadIdx.x; } +// Warp ID in the CUDA block +__device__ static unsigned getWarpId() { return threadIdx.x / WARPSIZE; } + +// The CUDA thread ID of the master thread. +__device__ static unsigned getMasterThreadId() { + unsigned Mask = WARPSIZE - 1; + return (getNumThreads() - 1) & (~Mask); +} + +// Find the active threads in the warp - return a mask whose n-th bit is set if +// the n-th thread in the warp is active. +__device__ static unsigned getActiveThreadsMask() { + return __BALLOT_SYNC(0xFFFFFFFF, true); +} + +// Return true if this is the first active thread in the warp. +__device__ static bool IsWarpMasterActiveThread() { + unsigned long long Mask = getActiveThreadsMask(); + unsigned long long ShNum = WARPSIZE - (getThreadId() % WARPSIZE); + unsigned long long Sh = Mask << ShNum; + return Sh == 0; +} +// Return true if this is the master thread. +__device__ static bool IsMasterThread() { + return getMasterThreadId() == getThreadId(); +} + +/// Return the provided size aligned to the size of a pointer. +__device__ static size_t AlignVal(size_t Val) { + const size_t Align = (size_t)sizeof(void *); + if (Val & (Align - 1)) { + Val += Align; + Val &= ~(Align - 1); + } + return Val; +} + +#define DSFLAG 0 +#define DSFLAG_INIT 0 +#define DSPRINT(_flag, _str, _args...) \ + { \ + if (_flag) { \ + /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x, _args);*/ \ + } \ + } +#define DSPRINT0(_flag, _str) \ + { \ + if (_flag) { \ + /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x);*/ \ + } \ + } + +// Initialize the shared data structures. This is expected to be called for the +// master thread and warp masters. \param RootS: A pointer to the root of the +// data sharing stack. 
\param InitialDataSize: The initial size of the data in +// the slot. +EXTERN void +__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS, + size_t InitialDataSize) { + + DSPRINT0(DSFLAG_INIT, + "Entering __kmpc_initialize_data_sharing_environment\n"); + + unsigned WID = getWarpId(); + DSPRINT(DSFLAG_INIT, "Warp ID: %d\n", WID); + + omptarget_nvptx_TeamDescr *teamDescr = + &omptarget_nvptx_threadPrivateContext->TeamContext(); + __kmpc_data_sharing_slot *RootS = teamDescr->RootS(WID); + + DataSharingState.SlotPtr[WID] = RootS; + DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; + + // We don't need to initialize the frame and active threads. + + DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", InitialDataSize); + DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (long long)RootS); + DSPRINT(DSFLAG_INIT, "Root slot data-end at: %016llx \n", + (long long)RootS->DataEnd); + DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n", (long long)RootS->Next); + DSPRINT(DSFLAG_INIT, "Shared slot ptr at: %016llx \n", + (long long)DataSharingState.SlotPtr[WID]); + DSPRINT(DSFLAG_INIT, "Shared stack ptr at: %016llx \n", + (long long)DataSharingState.StackPtr[WID]); + + DSPRINT0(DSFLAG_INIT, "Exiting __kmpc_initialize_data_sharing_environment\n"); +} + +EXTERN void *__kmpc_data_sharing_environment_begin( + __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, + void **SavedSharedFrame, int32_t *SavedActiveThreads, + size_t SharingDataSize, size_t SharingDefaultDataSize, + int16_t IsOMPRuntimeInitialized) { + + DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_begin\n"); + + // If the runtime has been elided, used __shared__ memory for master-worker + // data sharing. + if (!IsOMPRuntimeInitialized) + return (void *)&DataSharingState; + + DSPRINT(DSFLAG, "Data Size %016llx\n", SharingDataSize); + DSPRINT(DSFLAG, "Default Data Size %016llx\n", SharingDefaultDataSize); + + unsigned WID = getWarpId(); + unsigned CurActiveThreads = getActiveThreadsMask(); + + __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; + void *&StackP = DataSharingState.StackPtr[WID]; + void *&FrameP = DataSharingState.FramePtr[WID]; + int32_t &ActiveT = DataSharingState.ActiveThreads[WID]; + + DSPRINT0(DSFLAG, "Save current slot/stack values.\n"); + // Save the current values. + *SavedSharedSlot = SlotP; + *SavedSharedStack = StackP; + *SavedSharedFrame = FrameP; + *SavedActiveThreads = ActiveT; + + DSPRINT(DSFLAG, "Warp ID: %d\n", WID); + DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (long long)SlotP); + DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (long long)StackP); + DSPRINT(DSFLAG, "Saved frame ptr at: %016llx \n", (long long)FrameP); + DSPRINT(DSFLAG, "Active threads: %08x \n", ActiveT); + + // Only the warp active master needs to grow the stack. + if (IsWarpMasterActiveThread()) { + // Save the current active threads. + ActiveT = CurActiveThreads; + + // Make sure we use aligned sizes to avoid rematerialization of data. + SharingDataSize = AlignVal(SharingDataSize); + // FIXME: The default data size can be assumed to be aligned? + SharingDefaultDataSize = AlignVal(SharingDefaultDataSize); + + // Check if we have room for the data in the current slot. 
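+ // In other words, the comparison below asks whether StackP + SharingDataSize + // (already aligned above) would run past SlotP->DataEnd; if so, a new or reused + // slot is chained in, otherwise only the stack and frame pointers are advanced + // within the current slot.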
+ const uintptr_t CurrentStartAddress = (uintptr_t)StackP; + const uintptr_t CurrentEndAddress = (uintptr_t)SlotP->DataEnd; + const uintptr_t RequiredEndAddress = + CurrentStartAddress + (uintptr_t)SharingDataSize; + + DSPRINT(DSFLAG, "Data Size %016llx\n", SharingDataSize); + DSPRINT(DSFLAG, "Default Data Size %016llx\n", SharingDefaultDataSize); + DSPRINT(DSFLAG, "Current Start Address %016llx\n", CurrentStartAddress); + DSPRINT(DSFLAG, "Current End Address %016llx\n", CurrentEndAddress); + DSPRINT(DSFLAG, "Required End Address %016llx\n", RequiredEndAddress); + DSPRINT(DSFLAG, "Active Threads %08x\n", ActiveT); + + // If we require a new slot, allocate it and initialize it (or attempt to + // reuse one). Also, set the shared stack and slot pointers to the new + // place. If we do not need to grow the stack, just adapt the stack and + // frame pointers. + if (CurrentEndAddress < RequiredEndAddress) { + size_t NewSize = (SharingDataSize > SharingDefaultDataSize) + ? SharingDataSize + : SharingDefaultDataSize; + __kmpc_data_sharing_slot *NewSlot = 0; + + // Attempt to reuse an existing slot. + if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) { + uintptr_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd - + (uintptr_t)(&ExistingSlot->Data[0]); + if (ExistingSlotSize >= NewSize) { + DSPRINT(DSFLAG, "Reusing stack slot %016llx\n", + (long long)ExistingSlot); + NewSlot = ExistingSlot; + } else { + DSPRINT(DSFLAG, "Cleaning up -failed reuse - %016llx\n", + (long long)SlotP->Next); + free(ExistingSlot); + } + } + + if (!NewSlot) { + NewSlot = (__kmpc_data_sharing_slot *)malloc( + sizeof(__kmpc_data_sharing_slot) + NewSize); + DSPRINT(DSFLAG, "New slot allocated %016llx (data size=%016llx)\n", + (long long)NewSlot, NewSize); + } + + NewSlot->Next = 0; + NewSlot->DataEnd = &NewSlot->Data[NewSize]; + + SlotP->Next = NewSlot; + SlotP = NewSlot; + StackP = &NewSlot->Data[SharingDataSize]; + FrameP = &NewSlot->Data[0]; + } else { + + // Clean up any old slot that we may still have. The slot producers, do + // not eliminate them because that may be used to return data. + if (SlotP->Next) { + DSPRINT(DSFLAG, "Cleaning up - old not required - %016llx\n", + (long long)SlotP->Next); + free(SlotP->Next); + SlotP->Next = 0; + } + + FrameP = StackP; + StackP = (void *)RequiredEndAddress; + } + } + + // FIXME: Need to see the impact of doing it here. + __threadfence_block(); + + DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_begin\n"); + + // All the threads in this warp get the frame they should work with. + return FrameP; +} + +EXTERN void __kmpc_data_sharing_environment_end( + __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, + void **SavedSharedFrame, int32_t *SavedActiveThreads, + int32_t IsEntryPoint) { + + DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n"); + + unsigned WID = getWarpId(); + + if (IsEntryPoint) { + if (IsWarpMasterActiveThread()) { + DSPRINT0(DSFLAG, "Doing clean up\n"); + + // The master thread cleans the saved slot, because this is an environment + // only for the master. + __kmpc_data_sharing_slot *S = + IsMasterThread() ? *SavedSharedSlot : DataSharingState.SlotPtr[WID]; + + if (S->Next) { + free(S->Next); + S->Next = 0; + } + } + + DSPRINT0(DSFLAG, "Exiting Exiting __kmpc_data_sharing_environment_end\n"); + return; + } + + int32_t CurActive = getActiveThreadsMask(); + + // Only the warp master can restore the stack and frame information, and only + // if there are no other threads left behind in this environment (i.e. 
the + // warp diverged and returns in different places). This only works if we + // assume that threads will converge right after the call site that started + // the environment. + if (IsWarpMasterActiveThread()) { + int32_t &ActiveT = DataSharingState.ActiveThreads[WID]; + + DSPRINT0(DSFLAG, "Before restoring the stack\n"); + // Zero the bits in the mask. If it is still different from zero, then we + // have other threads that will return after the current ones. + ActiveT &= ~CurActive; + + DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n", CurActive, + ActiveT); + + if (!ActiveT) { + // No other active threads? Great, lets restore the stack. + + __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; + void *&StackP = DataSharingState.StackPtr[WID]; + void *&FrameP = DataSharingState.FramePtr[WID]; + + SlotP = *SavedSharedSlot; + StackP = *SavedSharedStack; + FrameP = *SavedSharedFrame; + ActiveT = *SavedActiveThreads; + + DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n", (long long)SlotP); + DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n", (long long)StackP); + DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n", (long long)FrameP); + DSPRINT(DSFLAG, "Active threads: %08x \n", ActiveT); + } + } + + // FIXME: Need to see the impact of doing it here. + __threadfence_block(); + + DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_end\n"); + return; +} + +EXTERN void * +__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID, + int16_t IsOMPRuntimeInitialized) { + DSPRINT0(DSFLAG, "Entering __kmpc_get_data_sharing_environment_frame\n"); + + // If the runtime has been elided, use __shared__ memory for master-worker + // data sharing. We're reusing the statically allocated data structure + // that is used for standard data sharing. + if (!IsOMPRuntimeInitialized) + return (void *)&DataSharingState; + + // Get the frame used by the requested thread. + + unsigned SourceWID = SourceThreadID / WARPSIZE; + + DSPRINT(DSFLAG, "Source warp: %d\n", SourceWID); + + void *P = DataSharingState.FramePtr[SourceWID]; + DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n"); + return P; +} Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/debug.h =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/debug.h +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/debug.h @@ -0,0 +1,276 @@ +//===------------- debug.h - NVPTX OpenMP debug macros ----------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains debug macros to be used in the application. +// +// Usage guide +// +// PRINT0(flag, str) : if debug flag is on, print (no arguments) +// PRINT(flag, str, args) : if debug flag is on, print (arguments) +// DON(flag) : return true if debug flag is on +// +// ASSERT(flag, cond, str, args): if test flag is on, test the condition +// if the condition is false, print str+args +// and assert. 
+// CAUTION: cond may be evaluate twice +// AON(flag) : return true if test flag is on +// +// WARNING(flag, str, args) : if warning flag is on, print the warning +// WON(flag) : return true if warning flag is on +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_NVPTX_DEBUG_H_ +#define _OMPTARGET_NVPTX_DEBUG_H_ + +//////////////////////////////////////////////////////////////////////////////// +// set desired level of debugging +//////////////////////////////////////////////////////////////////////////////// + +#define LD_SET_NONE 0ULL /* none */ +#define LD_SET_ALL -1ULL /* all */ + +// pos 1 +#define LD_SET_LOOP 0x1ULL /* basic loop */ +#define LD_SET_LOOPD 0x2ULL /* basic loop */ +#define LD_SET_PAR 0x4ULL /* basic parallel */ +#define LD_SET_PARD 0x8ULL /* basic parallel */ + +// pos 2 +#define LD_SET_SYNC 0x10ULL /* sync info */ +#define LD_SET_SYNCD 0x20ULL /* sync info */ +#define LD_SET_WAIT 0x40ULL /* state when waiting */ +#define LD_SET_TASK 0x80ULL /* print task info (high level) */ + +// pos 3 +#define LD_SET_IO 0x100ULL /* big region io (excl atomic) */ +#define LD_SET_IOD 0x200ULL /* big region io (excl atomic) */ +#define LD_SET_ENV 0x400ULL /* env info */ +#define LD_SET_CANCEL 0x800ULL /* print cancel info */ + +// pos 4 +#define LD_SET_MEM 0x1000ULL /* malloc / free */ + +//////////////////////////////////////////////////////////////////////////////// +// set the desired flags to print selected output. + +// these are some examples of possible definitions that can be used for +// debugging. +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_ALL) +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_LOOP) // limit to loop printfs to save +// on cuda buffer +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO) +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO | LD_SET_ENV) +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_PAR) + +#ifndef OMPTARGET_NVPTX_DEBUG +#define OMPTARGET_NVPTX_DEBUG LD_SET_NONE +#elif OMPTARGET_NVPTX_DEBUG +#warning debug is used, not good for measurements +#endif + +//////////////////////////////////////////////////////////////////////////////// +// set desired level of asserts +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// available flags + +#define LT_SET_NONE 0x0 /* unsafe */ +#define LT_SET_SAFETY \ + 0x1 /* check malloc type of stuff, input at creation, cheap */ +#define LT_SET_INPUT 0x2 /* check also all runtime inputs */ +#define LT_SET_FUSSY 0x4 /* fussy checks, expensive */ + +//////////////////////////////////////////////////////////////////////////////// +// set the desired flags + +#ifndef OMPTARGET_NVPTX_TEST +#if OMPTARGET_NVPTX_DEBUG +#define OMPTARGET_NVPTX_TEST (LT_SET_FUSSY) +#else +#define OMPTARGET_NVPTX_TEST (LT_SET_SAFETY) +#endif +#endif + +//////////////////////////////////////////////////////////////////////////////// +// set desired level of warnings +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// available flags + +#define LW_SET_ALL -1 +#define LW_SET_NONE 0x0 +#define LW_SET_ENV 0x1 +#define LW_SET_INPUT 0x2 +#define LW_SET_FUSSY 0x4 + +//////////////////////////////////////////////////////////////////////////////// +// set the desired flags + +#if OMPTARGET_NVPTX_DEBUG +#define OMPTARGET_NVPTX_WARNING (LW_SET_NONE) +#else +#define OMPTARGET_NVPTX_WARNING 
(LW_SET_FUSSY) +#endif + +//////////////////////////////////////////////////////////////////////////////// +// implemtation for debug +//////////////////////////////////////////////////////////////////////////////// + +#if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING +#include +#endif +#if OMPTARGET_NVPTX_TEST +#include +#endif + +// set flags that are tested (inclusion properties) + +#define LD_ALL (LD_SET_ALL) + +#define LD_LOOP (LD_SET_LOOP | LD_SET_LOOPD) +#define LD_LOOPD (LD_SET_LOOPD) +#define LD_PAR (LD_SET_PAR | LD_SET_PARD) +#define LD_PARD (LD_SET_PARD) + +// pos 2 +#define LD_SYNC (LD_SET_SYNC | LD_SET_SYNCD) +#define LD_SYNCD (LD_SET_SYNCD) +#define LD_WAIT (LD_SET_WAIT) +#define LD_TASK (LD_SET_TASK) + +// pos 3 +#define LD_IO (LD_SET_IO | LD_SET_IOD) +#define LD_IOD (LD_SET_IOD) +#define LD_ENV (LD_SET_ENV) +#define LD_CANCEL (LD_SET_CANCEL) + +// pos 3 +#define LD_MEM (LD_SET_MEM) + +// implement +#if OMPTARGET_NVPTX_DEBUG + +#define DON(_flag) ((OMPTARGET_NVPTX_DEBUG) & (_flag)) + +#define PRINT0(_flag, _str) \ + { \ + if (DON(_flag)) { \ + printf(": " _str, blockIdx.x, threadIdx.x, \ + threadIdx.x / WARPSIZE, threadIdx.x & 0x1F); \ + } \ + } + +#define PRINT(_flag, _str, _args...) \ + { \ + if (DON(_flag)) { \ + printf(": " _str, blockIdx.x, threadIdx.x, \ + threadIdx.x / WARPSIZE, threadIdx.x & 0x1F, _args); \ + } \ + } +#else + +#define DON(_flag) (FALSE) +#define PRINT0(flag, str) +#define PRINT(flag, str, _args...) + +#endif + +// for printing without worring about precision, pointers... +#define P64(_x) ((unsigned long long)(_x)) + +//////////////////////////////////////////////////////////////////////////////// +// early defs for test +//////////////////////////////////////////////////////////////////////////////// + +#define LT_SAFETY (LT_SET_SAFETY | LT_SET_INPUT | LT_SET_FUSSY) +#define LT_INPUT (LT_SET_INPUT | LT_SET_FUSSY) +#define LT_FUSSY (LT_SET_FUSSY) + +#if OMPTARGET_NVPTX_TEST == LT_SET_SAFETY + +#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag)) +#define ASSERT0(_flag, _cond, _str) \ + { \ + if (TON(_flag)) { \ + assert(_cond); \ + } \ + } +#define ASSERT(_flag, _cond, _str, _args...) \ + { \ + if (TON(_flag)) { \ + assert(_cond); \ + } \ + } + +#elif OMPTARGET_NVPTX_TEST >= LT_SET_INPUT + +#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag)) +#define ASSERT0(_flag, _cond, _str) \ + { \ + if (TON(_flag) && !(_cond)) { \ + printf(" ASSERT: " _str "\n", blockIdx.x, \ + threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F); \ + assert(_cond); \ + } \ + } +#define ASSERT(_flag, _cond, _str, _args...) \ + { \ + if (TON(_flag) && !(_cond)) { \ + printf(" ASSERT: " _str "\n", blockIdx.x, \ + threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F, _args); \ + assert(_cond); \ + } \ + } + +#else + +#define TON(_flag) (FALSE) +#define ASSERT0(_flag, _cond, _str) +#define ASSERT(_flag, _cond, _str, _args...) + +#endif + +//////////////////////////////////////////////////////////////////////////////// +// early defs for warning + +#define LW_ALL (LW_SET_ALL) +#define LW_ENV (LW_SET_FUSSY | LW_SET_INPUT | LW_SET_ENV) +#define LW_INPUT (LW_SET_FUSSY | LW_SET_INPUT) +#define LW_FUSSY (LW_SET_FUSSY) + +#if OMPTARGET_NVPTX_WARNING + +#define WON(_flag) ((OMPTARGET_NVPTX_WARNING) & (_flag)) +#define WARNING0(_flag, _str) \ + { \ + if (WON(_flag)) { \ + printf(" WARNING: " _str, blockIdx.x, \ + threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F); \ + } \ + } +#define WARNING(_flag, _str, _args...) 
\ + { \ + if (WON(_flag)) { \ + printf(" WARNING: " _str, blockIdx.x, \ + threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F, _args); \ + } \ + } + +#else + +#define WON(_flag) (FALSE) +#define WARNING0(_flag, _str) +#define WARNING(_flag, _str, _args...) + +#endif + +#endif Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h @@ -0,0 +1,509 @@ +//===------- interface.h - NVPTX OpenMP interface definitions ---- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains debug macros to be used in the application. +// +// This file contains all the definitions that are relevant to +// the interface. The first section contains the interface as +// declared by OpenMP. A second section includes library private calls +// (mostly debug, temporary?) The third section includes the compiler +// specific interfaces. +// +//===----------------------------------------------------------------------===// + +#ifndef _INTERFACES_H_ +#define _INTERFACES_H_ + +//////////////////////////////////////////////////////////////////////////////// +// OpenMP interface +//////////////////////////////////////////////////////////////////////////////// + +typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ +typedef uint64_t omp_nest_lock_t; /* arbitrary type of the right length */ + +typedef enum omp_sched_t { + omp_sched_static = 1, /* chunkSize >0 */ + omp_sched_dynamic = 2, /* chunkSize >0 */ + omp_sched_guided = 3, /* chunkSize >0 */ + omp_sched_auto = 4, /* no chunkSize */ +} omp_sched_t; + +typedef enum omp_proc_bind_t { + omp_proc_bind_false = 0, + omp_proc_bind_true = 1, + omp_proc_bind_master = 2, + omp_proc_bind_close = 3, + omp_proc_bind_spread = 4 +} omp_proc_bind_t; + +EXTERN double omp_get_wtick(void); +EXTERN double omp_get_wtime(void); + +EXTERN void omp_set_num_threads(int num); +EXTERN int omp_get_num_threads(void); +EXTERN int omp_get_max_threads(void); +EXTERN int omp_get_thread_limit(void); +EXTERN int omp_get_thread_num(void); +EXTERN int omp_get_num_procs(void); +EXTERN int omp_in_parallel(void); +EXTERN int omp_in_final(void); +EXTERN void omp_set_dynamic(int flag); +EXTERN int omp_get_dynamic(void); +EXTERN void omp_set_nested(int flag); +EXTERN int omp_get_nested(void); +EXTERN void omp_set_max_active_levels(int level); +EXTERN int omp_get_max_active_levels(void); +EXTERN int omp_get_level(void); +EXTERN int omp_get_active_level(void); +EXTERN int omp_get_ancestor_thread_num(int level); +EXTERN int omp_get_team_size(int level); + +EXTERN void omp_init_lock(omp_lock_t *lock); +EXTERN void omp_init_nest_lock(omp_nest_lock_t *lock); +EXTERN void omp_destroy_lock(omp_lock_t *lock); +EXTERN void omp_destroy_nest_lock(omp_nest_lock_t *lock); +EXTERN void omp_set_lock(omp_lock_t *lock); +EXTERN void omp_set_nest_lock(omp_nest_lock_t *lock); +EXTERN void omp_unset_lock(omp_lock_t *lock); +EXTERN void omp_unset_nest_lock(omp_nest_lock_t *lock); +EXTERN int omp_test_lock(omp_lock_t *lock); +EXTERN int omp_test_nest_lock(omp_nest_lock_t *lock); + +EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier); +EXTERN void 
omp_set_schedule(omp_sched_t kind, int modifier); +EXTERN omp_proc_bind_t omp_get_proc_bind(void); +EXTERN int omp_get_cancellation(void); +EXTERN void omp_set_default_device(int deviceId); +EXTERN int omp_get_default_device(void); +EXTERN int omp_get_num_devices(void); +EXTERN int omp_get_num_teams(void); +EXTERN int omp_get_team_num(void); +EXTERN int omp_is_initial_device(void); +EXTERN int omp_get_initial_device(void); +EXTERN int omp_get_max_task_priority(void); + +//////////////////////////////////////////////////////////////////////////////// +// OMPTARGET_NVPTX private (debug / temportary?) interface +//////////////////////////////////////////////////////////////////////////////// + +// for debug +EXTERN void __kmpc_print_str(char *title); +EXTERN void __kmpc_print_title_int(char *title, int data); +EXTERN void __kmpc_print_index(char *title, int i); +EXTERN void __kmpc_print_int(int data); +EXTERN void __kmpc_print_double(double data); +EXTERN void __kmpc_print_address_int64(int64_t data); + +//////////////////////////////////////////////////////////////////////////////// +// file below is swiped from kmpc host interface +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// kmp specifc types +//////////////////////////////////////////////////////////////////////////////// + +typedef enum kmp_sched_t { + kmp_sched_static_chunk = 33, + kmp_sched_static_nochunk = 34, + kmp_sched_dynamic = 35, + kmp_sched_guided = 36, + kmp_sched_runtime = 37, + kmp_sched_auto = 38, + + kmp_sched_static_ordered = 65, + kmp_sched_static_nochunk_ordered = 66, + kmp_sched_dynamic_ordered = 67, + kmp_sched_guided_ordered = 68, + kmp_sched_runtime_ordered = 69, + kmp_sched_auto_ordered = 70, + + kmp_sched_distr_static_chunk = 91, + kmp_sched_distr_static_nochunk = 92, + kmp_sched_distr_static_chunk_sched_static_chunkone = 93, + + kmp_sched_default = kmp_sched_static_nochunk, + kmp_sched_unordered_first = kmp_sched_static_chunk, + kmp_sched_unordered_last = kmp_sched_auto, + kmp_sched_ordered_first = kmp_sched_static_ordered, + kmp_sched_ordered_last = kmp_sched_auto_ordered, + kmp_sched_distribute_first = kmp_sched_distr_static_chunk, + kmp_sched_distribute_last = + kmp_sched_distr_static_chunk_sched_static_chunkone, + + /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers. + * Since we need to distinguish the three possible cases (no modifier, + * monotonic modifier, nonmonotonic modifier), we need separate bits for + * each modifier. The absence of monotonic does not imply nonmonotonic, + * especially since 4.5 says that the behaviour of the "no modifier" case + * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0. + * + * Since we're passing a full 32 bit value, we can use a couple of high + * bits for these flags; out of paranoia we avoid the sign bit. + * + * These modifiers can be or-ed into non-static schedules by the compiler + * to pass the additional information. They will be stripped early in the + * processing in __kmp_dispatch_init when setting up schedules, so + * most of the code won't ever see schedules with these bits set. 
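+ *
+ * For example, a guided schedule requested with the nonmonotonic modifier
+ * reaches the runtime as (kmp_sched_guided | kmp_sched_modifier_nonmonotonic);
+ * SCHEDULE_WITHOUT_MODIFIERS() below strips it back to kmp_sched_guided,
+ * while SCHEDULE_HAS_NONMONOTONIC() still reports that the modifier was
+ * requested.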
+ */ + kmp_sched_modifier_monotonic = (1 << 29), + /**< Set if the monotonic schedule modifier was present */ + kmp_sched_modifier_nonmonotonic = (1 << 30), +/**< Set if the nonmonotonic schedule modifier was present */ + +#define SCHEDULE_WITHOUT_MODIFIERS(s) \ + (enum kmp_sched_t)( \ + (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) +#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sched_modifier_monotonic) != 0) +#define SCHEDULE_HAS_NONMONOTONIC(s) \ + (((s)&kmp_sched_modifier_nonmonotonic) != 0) +#define SCHEDULE_HAS_NO_MODIFIERS(s) \ + (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \ + 0) + +} kmp_sched_t; + +// parallel defs +typedef void kmp_Indent; +typedef void (*kmp_ParFctPtr)(int32_t *global_tid, int32_t *bound_tid, ...); +typedef void (*kmp_ReductFctPtr)(void *lhsData, void *rhsData); +typedef void (*kmp_InterWarpCopyFctPtr)(void *src, int32_t warp_num); +typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, + int16_t lane_offset, + int16_t shortCircuit); +typedef void (*kmp_CopyToScratchpadFctPtr)(void *reduceData, void *scratchpad, + int32_t index, int32_t width); +typedef void (*kmp_LoadReduceFctPtr)(void *reduceData, void *scratchpad, + int32_t index, int32_t width, + int32_t reduce); + +// task defs +typedef struct kmp_TaskDescr kmp_TaskDescr; +typedef int32_t (*kmp_TaskFctPtr)(int32_t global_tid, kmp_TaskDescr *taskDescr); +typedef struct kmp_TaskDescr { + void *sharedPointerTable; // ptr to a table of shared var ptrs + kmp_TaskFctPtr sub; // task subroutine + int32_t partId; // unused + kmp_TaskFctPtr destructors; // destructor of c++ first private +} kmp_TaskDescr; +// task dep defs +#define KMP_TASKDEP_IN 0x1u +#define KMP_TASKDEP_OUT 0x2u +typedef struct kmp_TaskDep_Public { + void *addr; + size_t len; + uint8_t flags; // bit 0: in, bit 1: out +} kmp_TaskDep_Public; + +// flags that interpret the interface part of tasking flags +#define KMP_TASK_IS_TIED 0x1 +#define KMP_TASK_FINAL 0x2 +#define KMP_TASK_MERGED_IF0 0x4 /* unused */ +#define KMP_TASK_DESTRUCTOR_THUNK 0x8 + +// flags for task setup return +#define KMP_CURRENT_TASK_NOT_SUSPENDED 0 +#define KMP_CURRENT_TASK_SUSPENDED 1 + +// sync defs +typedef int32_t kmp_CriticalName[8]; + +//////////////////////////////////////////////////////////////////////////////// +// flags for kstate (all bits initially off) +//////////////////////////////////////////////////////////////////////////////// + +// first 2 bits used by kmp_Reduction (defined in kmp_reduction.cpp) +#define KMP_REDUCTION_MASK 0x3 +#define KMP_SKIP_NEXT_CALL 0x4 +#define KMP_SKIP_NEXT_CANCEL_BARRIER 0x8 + +//////////////////////////////////////////////////////////////////////////////// +// data +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// external interface +//////////////////////////////////////////////////////////////////////////////// + +// query +EXTERN int32_t __kmpc_global_num_threads(kmp_Indent *loc); // missing +EXTERN int32_t __kmpc_bound_thread_num(kmp_Indent *loc); // missing +EXTERN int32_t __kmpc_bound_num_threads(kmp_Indent *loc); // missing +EXTERN int32_t __kmpc_in_parallel(kmp_Indent *loc); // missing + +// parallel +EXTERN int32_t __kmpc_global_thread_num(kmp_Indent *loc); +EXTERN void __kmpc_push_num_threads(kmp_Indent *loc, int32_t global_tid, + int32_t num_threads); +// simd +EXTERN void __kmpc_push_simd_limit(kmp_Indent *loc, int32_t 
global_tid, + int32_t simd_limit); +// aee ... not supported +// EXTERN void __kmpc_fork_call(kmp_Indent *loc, int32_t argc, kmp_ParFctPtr +// microtask, ...); +EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid); +EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc, + uint32_t global_tid); +EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid); + +// proc bind +EXTERN void __kmpc_push_proc_bind(kmp_Indent *loc, uint32_t global_tid, + int proc_bind); +EXTERN int omp_get_num_places(void); +EXTERN int omp_get_place_num_procs(int place_num); +EXTERN void omp_get_place_proc_ids(int place_num, int *ids); +EXTERN int omp_get_place_num(void); +EXTERN int omp_get_partition_num_places(void); +EXTERN void omp_get_partition_place_nums(int *place_nums); + +// for static (no chunk or chunk) +EXTERN void __kmpc_for_static_init_4(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN void __kmpc_for_static_init_4u(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN void __kmpc_for_static_init_8(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN void __kmpc_for_static_init_8u(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter1, + uint64_t *plower, uint64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN +void __kmpc_for_static_init_4_simple_spmd(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN +void __kmpc_for_static_init_4u_simple_spmd(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN +void __kmpc_for_static_init_8_simple_spmd(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN +void __kmpc_for_static_init_8u_simple_spmd(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter1, + uint64_t *plower, uint64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN +void __kmpc_for_static_init_4_simple_generic(kmp_Indent *loc, + int32_t global_tid, int32_t sched, + int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN +void __kmpc_for_static_init_4u_simple_generic( + kmp_Indent *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN +void __kmpc_for_static_init_8_simple_generic(kmp_Indent *loc, + int32_t global_tid, int32_t sched, + int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN +void __kmpc_for_static_init_8u_simple_generic( + kmp_Indent *loc, int32_t global_tid, int32_t sched, int32_t *plastiter1, + uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, + int64_t chunk); + +EXTERN void __kmpc_for_static_fini(kmp_Indent *loc, int32_t global_tid); + +// for dynamic +EXTERN void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t global_tid, + 
int32_t sched, int32_t lower, int32_t upper, + int32_t incr, int32_t chunk); +EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t global_tid, + int32_t sched, uint32_t lower, + uint32_t upper, int32_t incr, + int32_t chunk); +EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int64_t lower, int64_t upper, + int64_t incr, int64_t chunk); +EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t global_tid, + int32_t sched, uint64_t lower, + uint64_t upper, int64_t incr, + int64_t chunk); + +EXTERN int __kmpc_dispatch_next_4(kmp_Indent *loc, int32_t global_tid, + int32_t *plastiter, int32_t *plower, + int32_t *pupper, int32_t *pstride); +EXTERN int __kmpc_dispatch_next_4u(kmp_Indent *loc, int32_t global_tid, + int32_t *plastiter, uint32_t *plower, + uint32_t *pupper, int32_t *pstride); +EXTERN int __kmpc_dispatch_next_8(kmp_Indent *loc, int32_t global_tid, + int32_t *plastiter, int64_t *plower, + int64_t *pupper, int64_t *pstride); +EXTERN int __kmpc_dispatch_next_8u(kmp_Indent *loc, int32_t global_tid, + int32_t *plastiter, uint64_t *plower, + uint64_t *pupper, int64_t *pstride); + +EXTERN void __kmpc_dispatch_fini_4(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_dispatch_fini_4u(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_dispatch_fini_8(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_dispatch_fini_8u(kmp_Indent *loc, int32_t global_tid); + +// Support for reducing conditional lastprivate variables +EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Indent *loc, + int32_t global_tid, + int32_t varNum, void *array); + +// reduction +EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid); +EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid); +EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); +EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); +EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); +EXTERN int32_t __kmpc_nvptx_simd_reduce_nowait( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); +EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); +EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); +EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); +EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size); +EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t 
delta, int16_t size); + +// sync barrier +EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid); +EXTERN void __kmpc_barrier_simple_spmd(kmp_Indent *loc_ref, int32_t tid); +EXTERN void __kmpc_barrier_simple_generic(kmp_Indent *loc_ref, int32_t tid); +EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc, int32_t global_tid); + +// single +EXTERN int32_t __kmpc_single(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_end_single(kmp_Indent *loc, int32_t global_tid); + +// sync +EXTERN int32_t __kmpc_master(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_end_master(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_ordered(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_end_ordered(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_critical(kmp_Indent *loc, int32_t global_tid, + kmp_CriticalName *crit); +EXTERN void __kmpc_end_critical(kmp_Indent *loc, int32_t global_tid, + kmp_CriticalName *crit); +EXTERN void __kmpc_flush(kmp_Indent *loc); + +// vote +EXTERN int32_t __kmpc_warp_active_thread_mask(); + +// tasks +EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Indent *loc, + uint32_t global_tid, int32_t flag, + size_t sizeOfTaskInclPrivate, + size_t sizeOfSharedTable, + kmp_TaskFctPtr sub); +EXTERN int32_t __kmpc_omp_task(kmp_Indent *loc, uint32_t global_tid, + kmp_TaskDescr *newLegacyTaskDescr); +EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Indent *loc, uint32_t global_tid, + kmp_TaskDescr *newLegacyTaskDescr, + int32_t depNum, void *depList, + int32_t noAliasDepNum, + void *noAliasDepList); +EXTERN void __kmpc_omp_task_begin_if0(kmp_Indent *loc, uint32_t global_tid, + kmp_TaskDescr *newLegacyTaskDescr); +EXTERN void __kmpc_omp_task_complete_if0(kmp_Indent *loc, uint32_t global_tid, + kmp_TaskDescr *newLegacyTaskDescr); +EXTERN void __kmpc_omp_wait_deps(kmp_Indent *loc, uint32_t global_tid, + int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList); +EXTERN void __kmpc_taskgroup(kmp_Indent *loc, uint32_t global_tid); +EXTERN void __kmpc_end_taskgroup(kmp_Indent *loc, uint32_t global_tid); +EXTERN int32_t __kmpc_omp_taskyield(kmp_Indent *loc, uint32_t global_tid, + int end_part); +EXTERN int32_t __kmpc_omp_taskwait(kmp_Indent *loc, uint32_t global_tid); +EXTERN void __kmpc_taskloop(kmp_Indent *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr, int if_val, + uint64_t *lb, uint64_t *ub, int64_t st, int nogroup, + int32_t sched, uint64_t grainsize, void *task_dup); + +// cancel +EXTERN int32_t __kmpc_cancellationpoint(kmp_Indent *loc, int32_t global_tid, + int32_t cancelVal); +EXTERN int32_t __kmpc_cancel(kmp_Indent *loc, int32_t global_tid, + int32_t cancelVal); + +// non standard +EXTERN void __kmpc_kernel_init_params(void *ReductionScratchpadPtr); +EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime); +EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized); +EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, + int16_t RequiresDataSharing); +EXTERN void __kmpc_spmd_kernel_deinit(); +EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, + int16_t IsOMPRuntimeInitialized); +EXTERN bool __kmpc_kernel_parallel(void **WorkFn, + int16_t IsOMPRuntimeInitialized); +EXTERN void __kmpc_kernel_end_parallel(); +EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask, + bool *IsFinal, + int32_t *LaneSource); +EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer); +EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask, + 
bool *IsFinal, int32_t *LaneSource, + int32_t *LaneId, int32_t *NumLanes); +EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer); + +// The slot used for data sharing by the master and worker threads. We use a +// complete (default size version and an incomplete one so that we allow sizes +// greater than the default). +struct __kmpc_data_sharing_slot { + __kmpc_data_sharing_slot *Next; + void *DataEnd; + char Data[]; +}; +EXTERN void +__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *RootS, + size_t InitialDataSize); +EXTERN void *__kmpc_data_sharing_environment_begin( + __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, + void **SavedSharedFrame, int32_t *SavedActiveThreads, + size_t SharingDataSize, size_t SharingDefaultDataSize, + int16_t IsOMPRuntimeInitialized); +EXTERN void __kmpc_data_sharing_environment_end( + __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, + void **SavedSharedFrame, int32_t *SavedActiveThreads, int32_t IsEntryPoint); + +EXTERN void * +__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID, + int16_t IsOMPRuntimeInitialized); +#endif Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu @@ -0,0 +1,462 @@ +//===------------ libcall.cu - NVPTX OpenMP user calls ----------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the OpenMP runtime functions that can be +// invoked by the user in an OpenMP region +// +//===----------------------------------------------------------------------===// + +#include "omptarget-nvptx.h" + +// Timer precision is 1ns +#define TIMER_PRECISION ((double)1E-9) + +EXTERN double omp_get_wtick(void) { + PRINT(LD_IO, "omp_get_wtick() returns %g\n", TIMER_PRECISION); + return TIMER_PRECISION; +} + +EXTERN double omp_get_wtime(void) { + unsigned long long nsecs; + asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs)); + double rc = (double)nsecs * TIMER_PRECISION; + PRINT(LD_IO, "call omp_get_wtime() returns %g\n", rc); + return rc; +} + +EXTERN void omp_set_num_threads(int num) { + PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num); + if (num <= 0) { + WARNING0(LW_INPUT, "expected positive num; ignore\n"); + } else { + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + currTaskDescr->NThreads() = num; + } +} + +EXTERN int omp_get_num_threads(void) { + int tid = GetLogicalThreadIdInBlock(); + int rc = GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()); + PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc); + return rc; +} + +EXTERN int omp_get_max_threads(void) { + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + int rc = 1; // default is 1 thread avail + if (!currTaskDescr->InParallelRegion()) { + // not currently in a parallel region... all are available + rc = GetNumberOfProcsInTeam(); + ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads"); + } + PRINT(LD_IO, "call omp_get_max_threads() return %\n", rc); + return rc; +} + +EXTERN int omp_get_thread_limit(void) { + // per contention group.. 
meaning threads in current team + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + int rc = currTaskDescr->ThreadLimit(); + PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc); + return rc; +} + +EXTERN int omp_get_thread_num() { + int tid = GetLogicalThreadIdInBlock(); + int rc = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()); + PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc); + return rc; +} + +EXTERN int omp_get_num_procs(void) { + int rc = GetNumberOfProcsInDevice(); + PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", rc); + return rc; +} + +EXTERN int omp_in_parallel(void) { + int rc = 0; + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + if (currTaskDescr->InParallelRegion()) { + rc = 1; + } + PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc); + return rc; +} + +EXTERN int omp_in_final(void) { + // treat all tasks as final... Specs may expect runtime to keep + // track more precisely if a task was actively set by users... This + // is not explicitely specified; will treat as if runtime can + // actively decide to put a non-final task into a final one. + int rc = 1; + PRINT(LD_IO, "call omp_in_final() returns %d\n", rc); + return rc; +} + +EXTERN void omp_set_dynamic(int flag) { + PRINT(LD_IO, "call omp_set_dynamic(%d)\n", flag); + + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + if (flag) { + currTaskDescr->SetDynamic(); + } else { + currTaskDescr->ClearDynamic(); + } +} + +EXTERN int omp_get_dynamic(void) { + int rc = 0; + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + if (currTaskDescr->IsDynamic()) { + rc = 1; + } + PRINT(LD_IO, "call omp_get_dynamic() returns %d\n", rc); + return rc; +} + +EXTERN void omp_set_nested(int flag) { + PRINT(LD_IO, "call omp_set_nested(%d) is ignored (no nested support)\n", + flag); +} + +EXTERN int omp_get_nested(void) { + int rc = 0; + PRINT(LD_IO, "call omp_get_nested() returns %d\n", rc); + return rc; +} + +EXTERN void omp_set_max_active_levels(int level) { + PRINT(LD_IO, + "call omp_set_max_active_levels(%d) is ignored (no nested support)\n", + level); +} + +EXTERN int omp_get_max_active_levels(void) { + int rc = 1; + PRINT(LD_IO, "call omp_get_max_active_levels() returns %d\n", rc); + return rc; +} + +EXTERN int omp_get_level(void) { + int level = 0; + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + ASSERT0(LT_FUSSY, currTaskDescr, + "do not expect fct to be called in a non-active thread"); + do { + if (currTaskDescr->IsParallelConstruct()) { + level++; + } + currTaskDescr = currTaskDescr->GetPrevTaskDescr(); + } while (currTaskDescr); + PRINT(LD_IO, "call omp_get_level() returns %d\n", level); + return level; +} + +EXTERN int omp_get_active_level(void) { + int level = 0; // no active level parallelism + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + ASSERT0(LT_FUSSY, currTaskDescr, + "do not expect fct to be called in a non-active thread"); + do { + if (currTaskDescr->ThreadsInTeam() > 1) { + // has a parallel with more than one thread in team + level = 1; + break; + } + currTaskDescr = currTaskDescr->GetPrevTaskDescr(); + } while (currTaskDescr); + PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level) + return level; +} + +EXTERN int omp_get_ancestor_thread_num(int level) { + int rc = 0; // default at level 0 + if (level >= 0) { + int totLevel = omp_get_level(); + if (level <= totLevel) { + omptarget_nvptx_TaskDescr *currTaskDescr = 
getMyTopTaskDescriptor(); + int steps = totLevel - level; + PRINT(LD_IO, "backtrack %d steps\n", steps); + ASSERT0(LT_FUSSY, currTaskDescr, + "do not expect fct to be called in a non-active thread"); + do { + if (DON(LD_IOD)) { + // print current state + omp_sched_t sched = currTaskDescr->GetRuntimeSched(); + PRINT(LD_ALL, + "task descr %s %d: %s, in par %d, dyn %d, rt sched %d," + " chunk %lld; tid %d, tnum %d, nthreads %d\n", + "ancestor", steps, + (currTaskDescr->IsParallelConstruct() ? "par" : "task"), + currTaskDescr->InParallelRegion(), currTaskDescr->IsDynamic(), + sched, currTaskDescr->RuntimeChunkSize(), + currTaskDescr->ThreadId(), currTaskDescr->ThreadsInTeam(), + currTaskDescr->NThreads()); + } + + if (currTaskDescr->IsParallelConstruct()) { + // found the level + if (!steps) { + rc = currTaskDescr->ThreadId(); + break; + } + steps--; + } + currTaskDescr = currTaskDescr->GetPrevTaskDescr(); + } while (currTaskDescr); + ASSERT0(LT_FUSSY, !steps, "expected to find all steps"); + } + } + PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level, + rc) + return rc; +} + +EXTERN int omp_get_team_size(int level) { + int rc = 1; // default at level 0 + if (level >= 0) { + int totLevel = omp_get_level(); + if (level <= totLevel) { + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + int steps = totLevel - level; + ASSERT0(LT_FUSSY, currTaskDescr, + "do not expect fct to be called in a non-active thread"); + do { + if (currTaskDescr->IsParallelConstruct()) { + if (!steps) { + // found the level + rc = currTaskDescr->ThreadsInTeam(); + break; + } + steps--; + } + currTaskDescr = currTaskDescr->GetPrevTaskDescr(); + } while (currTaskDescr); + ASSERT0(LT_FUSSY, !steps, "expected to find all steps"); + } + } + PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc) + return rc; +} + +EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) { + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + *kind = currTaskDescr->GetRuntimeSched(); + *modifier = currTaskDescr->RuntimeChunkSize(); + PRINT(LD_IO, "call omp_get_schedule returns sched %d and modif %d\n", + (int)*kind, *modifier); +} + +EXTERN void omp_set_schedule(omp_sched_t kind, int modifier) { + PRINT(LD_IO, "call omp_set_schedule(sched %d, modif %d)\n", (int)kind, + modifier); + if (kind >= omp_sched_static && kind < omp_sched_auto) { + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + currTaskDescr->SetRuntimeSched(kind); + currTaskDescr->RuntimeChunkSize() = modifier; + PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %d\n", + (int)currTaskDescr->GetRuntimeSched(), + currTaskDescr->RuntimeChunkSize()); + } +} + +EXTERN omp_proc_bind_t omp_get_proc_bind(void) { + PRINT0(LD_IO, "call omp_get_proc_bin() is true, regardless on state\n"); + return omp_proc_bind_true; +} + +EXTERN int omp_get_num_places(void) { + PRINT0(LD_IO, "call omp_get_num_places() returns 0\n"); + return 0; +} + +EXTERN int omp_get_place_num_procs(int place_num) { + PRINT0(LD_IO, "call omp_get_place_num_procs() returns 0\n"); + return 0; +} + +EXTERN void omp_get_place_proc_ids(int place_num, int *ids) { + PRINT0(LD_IO, "call to omp_get_place_proc_ids()\n"); +} + +EXTERN int omp_get_place_num(void) { + PRINT0(LD_IO, "call to omp_get_place_num() returns 0\n"); + return 0; +} + +EXTERN int omp_get_partition_num_places(void) { + PRINT0(LD_IO, "call to omp_get_partition_num_places() returns 0\n"); + return 0; +} + +EXTERN void omp_get_partition_place_nums(int 
*place_nums) { + PRINT0(LD_IO, "call to omp_get_partition_place_nums()\n"); +} + +EXTERN int omp_get_cancellation(void) { + int rc = FALSE; // currently false only + PRINT(LD_IO, "call omp_get_cancellation() returns %d\n", rc); + return rc; +} + +EXTERN void omp_set_default_device(int deviceId) { + PRINT0(LD_IO, "call omp_get_default_device() is undef on device\n"); +} + +EXTERN int omp_get_default_device(void) { + PRINT0(LD_IO, + "call omp_get_default_device() is undef on device, returns 0\n"); + return 0; +} + +EXTERN int omp_get_num_devices(void) { + PRINT0(LD_IO, "call omp_get_num_devices() is undef on device, returns 0\n"); + return 0; +} + +EXTERN int omp_get_num_teams(void) { + int rc = GetNumberOfOmpTeams(); + PRINT(LD_IO, "call omp_get_num_teams() returns %d\n", rc); + return rc; +} + +EXTERN int omp_get_team_num() { + int rc = GetOmpTeamId(); + PRINT(LD_IO, "call omp_get_team_num() returns %d\n", rc); + return rc; +} + +EXTERN int omp_is_initial_device(void) { + PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n"); + return 0; // 0 by def on device +} + +// Unspecified on the device. +EXTERN int omp_get_initial_device(void) { + PRINT0(LD_IO, "call omp_get_initial_device() returns 0\n"); + return 0; +} + +// Unused for now. +EXTERN int omp_get_max_task_priority(void) { + PRINT0(LD_IO, "call omp_get_max_task_priority() returns 0\n"); + return 0; +} + +//////////////////////////////////////////////////////////////////////////////// +// locks +//////////////////////////////////////////////////////////////////////////////// + +#define __OMP_SPIN 1000 +#define UNSET 0 +#define SET 1 + +EXTERN void omp_init_lock(omp_lock_t *lock) { + *lock = UNSET; + PRINT0(LD_IO, "call omp_init_lock()\n"); +} + +EXTERN void omp_destroy_lock(omp_lock_t *lock) { + PRINT0(LD_IO, "call omp_destroy_lock()\n"); +} + +EXTERN void omp_set_lock(omp_lock_t *lock) { + // int atomicCAS(int* address, int compare, int val); + // (old == compare ? val : old) + int compare = UNSET; + int val = SET; + + // TODO: not sure spinning is a good idea here.. + while (atomicCAS(lock, compare, val) != UNSET) { + + clock_t start = clock(); + clock_t now; + for (;;) { + now = clock(); + clock_t cycles = now > start ? now - start : now + (0xffffffff - start); + if (cycles >= __OMP_SPIN * blockIdx.x) { + break; + } + } + } // wait for 0 to be the read value + + PRINT0(LD_IO, "call omp_set_lock()\n"); +} + +EXTERN void omp_unset_lock(omp_lock_t *lock) { + int compare = SET; + int val = UNSET; + int old = atomicCAS(lock, compare, val); + + PRINT0(LD_IO, "call omp_unset_lock()\n"); +} + +EXTERN int omp_test_lock(omp_lock_t *lock) { + // int atomicCAS(int* address, int compare, int val); + // (old == compare ? 
val : old) + int compare = UNSET; + int val = SET; + + int ret = atomicCAS(lock, compare, val); + + PRINT(LD_IO, "call omp_test_lock() return %d\n", ret); + + return ret; +} + +// for xlf Fotran +// Fotran, the return is LOGICAL type + +#define FLOGICAL long +EXTERN FLOGICAL __xlf_omp_is_initial_device_i8() { + int ret = omp_is_initial_device(); + if (ret == 0) + return (FLOGICAL)0; + else + return (FLOGICAL)1; +} + +EXTERN int __xlf_omp_is_initial_device_i4() { + int ret = omp_is_initial_device(); + if (ret == 0) + return 0; + else + return 1; +} + +EXTERN long __xlf_omp_get_team_num_i4() { + int ret = omp_get_team_num(); + return (long)ret; +} + +EXTERN long __xlf_omp_get_num_teams_i4() { + int ret = omp_get_num_teams(); + return (long)ret; +} + +EXTERN void xlf_debug_print_int(int *p) { + printf("xlf DEBUG %d): %p %d\n", omp_get_team_num(), p, p == 0 ? 0 : *p); +} + +EXTERN void xlf_debug_print_long(long *p) { + printf("xlf DEBUG %d): %p %ld\n", omp_get_team_num(), p, p == 0 ? 0 : *p); +} + +EXTERN void xlf_debug_print_float(float *p) { + printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p); +} + +EXTERN void xlf_debug_print_double(double *p) { + printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p); +} + +EXTERN void xlf_debug_print_addr(void *p) { + printf("xlf DEBUG %d): %p \n", omp_get_team_num(), p); +} Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -0,0 +1,772 @@ +//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the KMPC interface +// for the loop construct plus other worksharing constructs that use the same +// interface as loops. +// +//===----------------------------------------------------------------------===// + +#include "omptarget-nvptx.h" + +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// +// template class that encapsulate all the helper functions +// +// T is loop iteration type (32 | 64) (unsigned | signed) +// ST is the signed version of T +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +template class omptarget_nvptx_LoopSupport { +public: + //////////////////////////////////////////////////////////////////////////////// + // Loop with static scheduling with chunk + + // Generic implementation of OMP loop scheduling with static policy + /*! \brief Calculate initial bounds for static loop and stride + * @param[in] loc location in code of the call (not used here) + * @param[in] global_tid global thread id + * @param[in] schetype type of scheduling (see omptarget-nvptx.h) + * @param[in] plastiter pointer to last iteration + * @param[in,out] pointer to loop lower bound. it will contain value of + * lower bound of first chunk + * @param[in,out] pointer to loop upper bound. 
It will contain value of + * upper bound of first chunk + * @param[in,out] pointer to loop stride. It will contain value of stride + * between two successive chunks executed by the same thread + * @param[in] loop increment bump + * @param[in] chunk size + */ + + // helper function for static chunk + INLINE static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, + ST chunk, T entityId, T numberOfEntities) { + // each thread executes multiple chunks all of the same size, except + // the last one + + // distance between two successive chunks + stride = numberOfEntities * chunk; + lb = lb + entityId * chunk; + T inputUb = ub; + ub = lb + chunk - 1; // Clang uses i <= ub + // Say ub' is the begining of the last chunk. Then who ever has a + // lower bound plus a multiple of the increment equal to ub' is + // the last one. + T beginingLastChunk = inputUb - (inputUb % chunk); + last = ((beginingLastChunk - lb) % stride) == 0; + } + + //////////////////////////////////////////////////////////////////////////////// + // Loop with static scheduling without chunk + + // helper function for static no chunk + INLINE static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, + ST &chunk, T entityId, + T numberOfEntities) { + // No chunk size specified. Each thread or warp gets at most one + // chunk; chunks are all almost of equal size + T loopSize = ub - lb + 1; + + chunk = loopSize / numberOfEntities; + T leftOver = loopSize - chunk * numberOfEntities; + + if (entityId < leftOver) { + chunk++; + lb = lb + entityId * chunk; + } else { + lb = lb + entityId * chunk + leftOver; + } + + T inputUb = ub; + ub = lb + chunk - 1; // Clang uses i <= ub + last = ub == inputUb; + stride = loopSize; // make sure we only do 1 chunk per warp + } + + //////////////////////////////////////////////////////////////////////////////// + // Support for Static Init + + INLINE static void for_static_init(int32_t schedtype, int32_t *plastiter, + T *plower, T *pupper, ST *pstride, + ST chunk, bool IsSPMDExecutionMode, + bool IsOMPRuntimeUnavailable = false) { + // When IsOMPRuntimeUnavailable is true, we assume that the caller is + // in an L0 parallel region and that all worker threads participate. 
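+ //
+ // Illustration of the static-chunk case handled below (numbers chosen
+ // for illustration only): with kmp_sched_static_chunk, *plower = 0,
+ // *pupper = 99, chunk = 10 and 4 OpenMP threads, ForStaticChunk() gives
+ // thread 1 the bounds lb = 10, ub = 19 and stride = 4 * 10 = 40, so that
+ // thread later covers [10,19], [50,59] and [90,99]; its lastiter flag is
+ // set because [90,99] is the final chunk of the loop.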
+ + int tid = GetLogicalThreadIdInBlock(); + + // Assume we are in teams region or that we use a single block + // per target region + ST numberOfActiveOMPThreads = GetNumberOfOmpThreads( + tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + + // All warps that are in excess of the maximum requested, do + // not execute the loop + PRINT(LD_LOOP, + "OMP Thread %d: schedule type %d, chunk size = %lld, mytid " + "%d, num tids %d\n", + GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable), + schedtype, P64(chunk), + GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable), + GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, + IsOMPRuntimeUnavailable)); + ASSERT0( + LT_FUSSY, + (GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable)) < + (GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, + IsOMPRuntimeUnavailable)), + "current thread is not needed here; error"); + + // copy + int lastiter = 0; + T lb = *plower; + T ub = *pupper; + ST stride = *pstride; + T entityId, numberOfEntities; + // init + switch (schedtype) { + case kmp_sched_static_chunk: { + if (chunk > 0) { + entityId = + GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, + IsOMPRuntimeUnavailable); + ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId, + numberOfEntities); + break; + } + } // note: if chunk <=0, use nochunk + case kmp_sched_static_nochunk: { + entityId = + GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, + IsOMPRuntimeUnavailable); + ForStaticNoChunk(lastiter, lb, ub, stride, chunk, entityId, + numberOfEntities); + break; + } + case kmp_sched_distr_static_chunk: { + if (chunk > 0) { + entityId = GetOmpTeamId(); + numberOfEntities = GetNumberOfOmpTeams(); + ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId, + numberOfEntities); + break; + } // note: if chunk <=0, use nochunk + } + case kmp_sched_distr_static_nochunk: { + entityId = GetOmpTeamId(); + numberOfEntities = GetNumberOfOmpTeams(); + + ForStaticNoChunk(lastiter, lb, ub, stride, chunk, entityId, + numberOfEntities); + break; + } + case kmp_sched_distr_static_chunk_sched_static_chunkone: { + entityId = + GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, + IsOMPRuntimeUnavailable) * + GetOmpTeamId() + + GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + numberOfEntities = GetNumberOfOmpTeams() * + GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, + IsOMPRuntimeUnavailable); + ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId, + numberOfEntities); + break; + } + default: { + ASSERT(LT_FUSSY, FALSE, "unknown schedtype %d", schedtype); + PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n", + schedtype); + entityId = + GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, + IsOMPRuntimeUnavailable); + ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId, + numberOfEntities); + } + } + // copy back + *plastiter = lastiter; + *plower = lb; + *pupper = ub; + *pstride = stride; + PRINT(LD_LOOP, + "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld\n", + GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, + IsOMPRuntimeUnavailable), + GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper), + P64(*pstride)); + } + + //////////////////////////////////////////////////////////////////////////////// + // Support for dispatch Init + + 
INLINE static int OrderedSchedule(kmp_sched_t schedule) { + return schedule >= kmp_sched_ordered_first && + schedule <= kmp_sched_ordered_last; + } + + INLINE static void dispatch_init(kmp_sched_t schedule, T lb, T ub, ST st, + ST chunk) { + int tid = GetLogicalThreadIdInBlock(); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid); + T tnum = currTaskDescr->ThreadsInTeam(); + T tripCount = ub - lb + 1; // +1 because ub is inclusive + ASSERT0( + LT_FUSSY, + GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()) < + GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), + "current thread is not needed here; error"); + + /* Currently just ignore the monotonic and non-monotonic modifiers + * (the compiler isn't producing them * yet anyway). + * When it is we'll want to look at them somewhere here and use that + * information to add to our schedule choice. We shouldn't need to pass + * them on, they merely affect which schedule we can legally choose for + * various dynamic cases. (In paritcular, whether or not a stealing scheme + * is legal). + */ + schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); + + // Process schedule. + if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) { + PRINT(LD_LOOP, + "go sequential as tnum=%d, trip count %lld, ordered sched=%d\n", + tnum, P64(tripCount), schedule); + schedule = kmp_sched_static_chunk; + chunk = tripCount; // one thread gets the whole loop + + } else if (schedule == kmp_sched_runtime) { + // process runtime + omp_sched_t rtSched = currTaskDescr->GetRuntimeSched(); + chunk = currTaskDescr->RuntimeChunkSize(); + switch (rtSched) { + case omp_sched_static: { + if (chunk > 0) + schedule = kmp_sched_static_chunk; + else + schedule = kmp_sched_static_nochunk; + break; + } + case omp_sched_auto: { + schedule = kmp_sched_static_chunk; + chunk = 1; + break; + } + case omp_sched_dynamic: + case omp_sched_guided: { + schedule = kmp_sched_dynamic; + break; + } + } + PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", schedule, + P64(chunk)); + } else if (schedule == kmp_sched_auto) { + schedule = kmp_sched_static_chunk; + chunk = 1; + PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", schedule, + P64(chunk)); + } else { + PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", schedule, P64(chunk)); + ASSERT(LT_FUSSY, + schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, + "unknown schedule %d & chunk %lld\n", schedule, P64(chunk)); + } + + // save sched state + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; + + // init schedules + if (schedule == kmp_sched_static_chunk) { + ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); + // save ub + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; + // compute static chunk + ST stride; + T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()); + int lastiter = 0; + ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); + // save computed params + omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; + omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; + PRINT(LD_LOOP, + "dispatch init (static chunk) : num threads = %d, ub = %lld," + "next lower bound = %lld, stride = %lld\n", + GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + 
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + omptarget_nvptx_threadPrivateContext->Stride(tid)); + + } else if (schedule == kmp_sched_static_nochunk) { + ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value"); + // save ub + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; + // compute static chunk + ST stride; + T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()); + int lastiter = 0; + ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); + // save computed params + omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; + omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; + PRINT(LD_LOOP, + "dispatch init (static nochunk) : num threads = %d, ub = %lld," + "next lower bound = %lld, stride = %lld\n", + GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + omptarget_nvptx_threadPrivateContext->Stride(tid)); + + } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) { + if (chunk < 1) + chunk = 1; + Counter eventNum = ((tripCount - 1) / chunk) + 1; // number of chunks + // but each thread (but one) must discover that it is last + eventNum += tnum; + omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; + omptarget_nvptx_threadPrivateContext->EventsNumber(tid) = eventNum; + PRINT(LD_LOOP, + "dispatch init (dyn) : num threads = %d, ub = %lld, chunk %lld, " + "events number = %lld\n", + GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + omptarget_nvptx_threadPrivateContext->Chunk(tid), + omptarget_nvptx_threadPrivateContext->EventsNumber(tid)); + } + } + + //////////////////////////////////////////////////////////////////////////////// + // Support for dispatch next + + INLINE static int DynamicNextChunk(omptarget_nvptx_CounterGroup &cg, + Counter priv, T &lb, T &ub, + Counter &chunkId, Counter ¤tEvent, + T chunkSize, T loopUpperBound) { + // get next event atomically + Counter nextEvent = cg.Next(); + // calculate chunk Id (priv was initialized upon entering the loop to + // 'start' == 'event') + chunkId = nextEvent - priv; + // calculate lower bound for all lanes in the warp + lb = chunkId * chunkSize; // this code assume normalization of LB + ub = lb + chunkSize - 1; // Clang uses i <= ub + + // 3 result cases: + // a. lb and ub < loopUpperBound --> NOT_FINISHED + // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk --> + // NOT_FINISHED + // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED + currentEvent = nextEvent; + // a. + if (ub <= loopUpperBound) { + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", P64(lb), + P64(ub), P64(loopUpperBound)); + return NOT_FINISHED; + } + // b. + if (lb <= loopUpperBound) { + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n", + P64(lb), P64(ub), P64(loopUpperBound)); + ub = loopUpperBound; + return LAST_CHUNK; + } + // c. if we are here, we are in case 'c' + lb = loopUpperBound + 1; + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", P64(lb), + P64(ub), P64(loopUpperBound)); + return FINISHED; + } + + // On Pascal, with inlining of the runtime into the user application, + // this code deadlocks. This is probably because different threads + // in a warp cannot make independent progress. 
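+ //
+ // Illustrative sketch of how these dispatch entry points are driven by
+ // compiler-generated code (placeholder names; actual codegen may differ):
+ //
+ //   __kmpc_dispatch_init_4(loc, tid, kmp_sched_dynamic, 0, N - 1, 1, chunk);
+ //   while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &stride) !=
+ //          DISPATCH_FINISHED)
+ //     for (int i = lb; i <= ub; ++i) // upper bound is inclusive
+ //       body(i);
+ //
+ // Each successful __kmpc_dispatch_next_4() call hands the thread one
+ // chunk [lb, ub] until the iteration space is exhausted.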
+ NOINLINE static int dispatch_next(int32_t *plast, T *plower, T *pupper, + ST *pstride) { + // ID of a thread in its own warp + + // automatically selects thread or warp ID based on selected implementation + int tid = GetLogicalThreadIdInBlock(); + ASSERT0( + LT_FUSSY, + GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()) < + GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), + "current thread is not needed here; error"); + // retrieve schedule + kmp_sched_t schedule = + omptarget_nvptx_threadPrivateContext->ScheduleType(tid); + + // xxx reduce to one + if (schedule == kmp_sched_static_chunk || + schedule == kmp_sched_static_nochunk) { + T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid); + T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid); + // finished? + if (myLb > ub) { + PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n", + P64(myLb), P64(ub)); + return DISPATCH_FINISHED; + } + // not finished, save current bounds + ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid); + *plower = myLb; + T myUb = myLb + chunk - 1; // Clang uses i <= ub + if (myUb > ub) + myUb = ub; + *pupper = myUb; + *plast = (int32_t)(myUb == ub); + + // increment next lower bound by the stride + ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid); + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride; + PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n", + P64(*plower), P64(*pupper)); + return DISPATCH_NOTFINISHED; + } + ASSERT0(LT_FUSSY, + schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, + "bad sched"); + omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); + T myLb, myUb; + Counter chunkId; + // xxx current event is now local + omptarget_nvptx_CounterGroup &cg = teamDescr.WorkDescr().CounterGroup(); + int finished = DynamicNextChunk( + cg, omptarget_nvptx_threadPrivateContext->Priv(tid), myLb, myUb, + chunkId, omptarget_nvptx_threadPrivateContext->CurrentEvent(tid), + omptarget_nvptx_threadPrivateContext->Chunk(tid), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid)); + + if (finished == FINISHED) { + cg.Complete(omptarget_nvptx_threadPrivateContext->Priv(tid), + omptarget_nvptx_threadPrivateContext->EventsNumber(tid)); + cg.Release(omptarget_nvptx_threadPrivateContext->Priv(tid), + omptarget_nvptx_threadPrivateContext->CurrentEvent(tid)); + + return DISPATCH_FINISHED; + } + + // not finished (either not finished or last chunk) + *plast = (int32_t)( + myUb == omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid)); + *plower = myLb; + *pupper = myUb; + *pstride = 1; + + PRINT(LD_LOOP, + "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld\n", + GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), + GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper), + P64(*pstride)); + return DISPATCH_NOTFINISHED; + } + + INLINE static void dispatch_fini() { + // nothing + } + + //////////////////////////////////////////////////////////////////////////////// + // end of template class that encapsulate all the helper functions + //////////////////////////////////////////////////////////////////////////////// +}; + +//////////////////////////////////////////////////////////////////////////////// +// KMP interface implementation (dyn loops) +//////////////////////////////////////////////////////////////////////////////// + +// init +EXTERN void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t tid, + int32_t schedule, int32_t lb, int32_t ub, + 
int32_t st, int32_t chunk) { + PRINT0(LD_IO, "call kmpc_dispatch_init_4\n"); + omptarget_nvptx_LoopSupport::dispatch_init( + (kmp_sched_t)schedule, lb, ub, st, chunk); +} + +EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t tid, + int32_t schedule, uint32_t lb, uint32_t ub, + int32_t st, int32_t chunk) { + PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n"); + omptarget_nvptx_LoopSupport::dispatch_init( + (kmp_sched_t)schedule, lb, ub, st, chunk); +} + +EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t tid, + int32_t schedule, int64_t lb, int64_t ub, + int64_t st, int64_t chunk) { + PRINT0(LD_IO, "call kmpc_dispatch_init_8\n"); + omptarget_nvptx_LoopSupport::dispatch_init( + (kmp_sched_t)schedule, lb, ub, st, chunk); +} + +EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t tid, + int32_t schedule, uint64_t lb, uint64_t ub, + int64_t st, int64_t chunk) { + PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n"); + omptarget_nvptx_LoopSupport::dispatch_init( + (kmp_sched_t)schedule, lb, ub, st, chunk); +} + +// next +EXTERN int __kmpc_dispatch_next_4(kmp_Indent *loc, int32_t tid, int32_t *p_last, + int32_t *p_lb, int32_t *p_ub, int32_t *p_st) { + PRINT0(LD_IO, "call kmpc_dispatch_next_4\n"); + return omptarget_nvptx_LoopSupport::dispatch_next( + p_last, p_lb, p_ub, p_st); +} + +EXTERN int __kmpc_dispatch_next_4u(kmp_Indent *loc, int32_t tid, + int32_t *p_last, uint32_t *p_lb, + uint32_t *p_ub, int32_t *p_st) { + PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n"); + return omptarget_nvptx_LoopSupport::dispatch_next( + p_last, p_lb, p_ub, p_st); +} + +EXTERN int __kmpc_dispatch_next_8(kmp_Indent *loc, int32_t tid, int32_t *p_last, + int64_t *p_lb, int64_t *p_ub, int64_t *p_st) { + PRINT0(LD_IO, "call kmpc_dispatch_next_8\n"); + return omptarget_nvptx_LoopSupport::dispatch_next( + p_last, p_lb, p_ub, p_st); +} + +EXTERN int __kmpc_dispatch_next_8u(kmp_Indent *loc, int32_t tid, + int32_t *p_last, uint64_t *p_lb, + uint64_t *p_ub, int64_t *p_st) { + PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n"); + return omptarget_nvptx_LoopSupport::dispatch_next( + p_last, p_lb, p_ub, p_st); +} + +// fini +EXTERN void __kmpc_dispatch_fini_4(kmp_Indent *loc, int32_t tid) { + PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n"); + omptarget_nvptx_LoopSupport::dispatch_fini(); +} + +EXTERN void __kmpc_dispatch_fini_4u(kmp_Indent *loc, int32_t tid) { + PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n"); + omptarget_nvptx_LoopSupport::dispatch_fini(); +} + +EXTERN void __kmpc_dispatch_fini_8(kmp_Indent *loc, int32_t tid) { + PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n"); + omptarget_nvptx_LoopSupport::dispatch_fini(); +} + +EXTERN void __kmpc_dispatch_fini_8u(kmp_Indent *loc, int32_t tid) { + PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n"); + omptarget_nvptx_LoopSupport::dispatch_fini(); +} + +//////////////////////////////////////////////////////////////////////////////// +// KMP interface implementation (static loops) +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_for_static_init_4(kmp_Indent *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode()); +} + +EXTERN void __kmpc_for_static_init_4u(kmp_Indent *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + uint32_t *plower, uint32_t 
*pupper, + int32_t *pstride, int32_t incr, + int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4u\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode()); +} + +EXTERN void __kmpc_for_static_init_8(kmp_Indent *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode()); +} + +EXTERN void __kmpc_for_static_init_8u(kmp_Indent *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + uint64_t *plower, uint64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8u\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode()); +} + +EXTERN +void __kmpc_for_static_init_4_simple_spmd(kmp_Indent *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, + /*isSPMDExecutionMode=*/true, + /*IsOMPRuntimeUnavailable=*/true); +} + +EXTERN +void __kmpc_for_static_init_4u_simple_spmd(kmp_Indent *loc, int32_t global_tid, + int32_t schedtype, + int32_t *plastiter, uint32_t *plower, + uint32_t *pupper, int32_t *pstride, + int32_t incr, int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, + /*isSPMDExecutionMode=*/true, + /*IsOMPRuntimeUnavailable=*/true); +} + +EXTERN +void __kmpc_for_static_init_8_simple_spmd(kmp_Indent *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, + /*isSPMDExecutionMode=*/true, + /*IsOMPRuntimeUnavailable=*/true); +} + +EXTERN +void __kmpc_for_static_init_8u_simple_spmd(kmp_Indent *loc, int32_t global_tid, + int32_t schedtype, + int32_t *plastiter, uint64_t *plower, + uint64_t *pupper, int64_t *pstride, + int64_t incr, int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, + /*isSPMDExecutionMode=*/true, + /*IsOMPRuntimeUnavailable=*/true); +} + +EXTERN +void __kmpc_for_static_init_4_simple_generic( + kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr, + int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, + /*isSPMDExecutionMode=*/false, + /*IsOMPRuntimeUnavailable=*/true); +} + +EXTERN +void __kmpc_for_static_init_4u_simple_generic( + kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, + int32_t chunk) { + PRINT0(LD_IO, "call 
kmpc_for_static_init_4u_simple_generic\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, + /*isSPMDExecutionMode=*/false, + /*IsOMPRuntimeUnavailable=*/true); +} + +EXTERN +void __kmpc_for_static_init_8_simple_generic( + kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, + /*isSPMDExecutionMode=*/false, + /*IsOMPRuntimeUnavailable=*/true); +} + +EXTERN +void __kmpc_for_static_init_8u_simple_generic( + kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, + /*isSPMDExecutionMode=*/false, + /*IsOMPRuntimeUnavailable=*/true); +} + +EXTERN void __kmpc_for_static_fini(kmp_Indent *loc, int32_t global_tid) { + PRINT0(LD_IO, "call kmpc_for_static_fini\n"); +} + +namespace { +INLINE void syncWorkersInGenericMode(uint32_t NumThreads) { + int NumWarps = ((NumThreads + WARPSIZE - 1) / WARPSIZE); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + // On Volta and newer architectures we require that all lanes in + // a warp (at least, all present for the kernel launch) participate in the + // barrier. This is enforced when launching the parallel region. An + // exception is when there are < WARPSIZE workers. In this case only 1 worker + // is started, so we don't need a barrier. + if (NumThreads > 1) { +#endif + named_sync(L1_BARRIER, WARPSIZE * NumWarps); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + } +#endif +} +}; // namespace + +EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Indent *loc, int32_t gtid, + int32_t varNum, void *array) { + PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n"); + + omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); + int tid = GetOmpThreadId(GetLogicalThreadIdInBlock(), isSPMDMode(), + isRuntimeUninitialized()); + uint32_t NumThreads = GetNumberOfOmpThreads( + GetLogicalThreadIdInBlock(), isSPMDMode(), isRuntimeUninitialized()); + uint64_t *Buffer = teamDescr.getLastprivateIterBuffer(); + for (unsigned i = 0; i < varNum; i++) { + // Reset buffer. + if (tid == 0) + *Buffer = 0; // Reset to minimum loop iteration value. + + // Barrier. + syncWorkersInGenericMode(NumThreads); + + // Atomic max of iterations. + uint64_t *varArray = (uint64_t *)array; + uint64_t elem = varArray[i]; + (void)atomicMax((unsigned long long int *)Buffer, + (unsigned long long int)elem); + + // Barrier. + syncWorkersInGenericMode(NumThreads); + + // Read max value and update thread private array. + varArray[i] = *Buffer; + + // Barrier. 
+ syncWorkersInGenericMode(NumThreads); + } +} Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu @@ -0,0 +1,48 @@ +//===------------ omp_data.cu - NVPTX OpenMP GPU objects --------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the data objects used on the GPU device. +// +//===----------------------------------------------------------------------===// + +#include "omptarget-nvptx.h" + +//////////////////////////////////////////////////////////////////////////////// +// global data holding OpenMP state information +//////////////////////////////////////////////////////////////////////////////// + +__device__ + omptarget_nvptx_Queue + omptarget_nvptx_device_State[MAX_SM]; + +// Pointer to this team's OpenMP state object +__device__ __shared__ + omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; + +//////////////////////////////////////////////////////////////////////////////// +// The team master sets the outlined parallel function in this variable to +// communicate with the workers. Since it is in shared memory, there is one +// copy of these variables for each kernel, instance, and team. +//////////////////////////////////////////////////////////////////////////////// +volatile __device__ __shared__ omptarget_nvptx_WorkFn omptarget_nvptx_workFn; + +//////////////////////////////////////////////////////////////////////////////// +// OpenMP kernel execution parameters +//////////////////////////////////////////////////////////////////////////////// +__device__ __shared__ uint32_t execution_param; + +//////////////////////////////////////////////////////////////////////////////// +// Data sharing state +//////////////////////////////////////////////////////////////////////////////// +__device__ __shared__ DataSharingStateTy DataSharingState; + +//////////////////////////////////////////////////////////////////////////////// +// Scratchpad for teams reduction. +//////////////////////////////////////////////////////////////////////////////// +__device__ __shared__ void *ReductionScratchpadPtr; Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -0,0 +1,362 @@ +//===---- omptarget-nvptx.h - NVPTX OpenMP GPU initialization ---- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of all library macros, types, +// and functions. 
+// +//===----------------------------------------------------------------------===// + +#ifndef __OMPTARGET_NVPTX_H +#define __OMPTARGET_NVPTX_H + +// std includes +#include +#include + +// cuda includes +#include +#include + +// local includes +#include "counter_group.h" +#include "debug.h" // debug +#include "interface.h" // interfaces with omp, compiler, and user +#include "option.h" // choices we have +#include "state-queue.h" +#include "support.h" + +#define OMPTARGET_NVPTX_VERSION 1.1 + +// used by the library for the interface with the app +#define DISPATCH_FINISHED 0 +#define DISPATCH_NOTFINISHED 1 + +// used by dynamic scheduling +#define FINISHED 0 +#define NOT_FINISHED 1 +#define LAST_CHUNK 2 + +#define BARRIER_COUNTER 0 +#define ORDERED_COUNTER 1 + +// Macros for Cuda intrinsics +// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. +// Also, __ballot(1) in Cuda 8.0 is replaced with __activemask(). +#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 +#define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane)) +#define __SHFL_DOWN_SYNC(mask, var, delta, width) \ + __shfl_down_sync((mask), (var), (delta), (width)) +#define __BALLOT_SYNC(mask, predicate) __ballot_sync((mask), (predicate)) +#define __ACTIVEMASK() __activemask() +#else +#define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane)) +#define __SHFL_DOWN_SYNC(mask, var, delta, width) \ + __shfl_down((var), (delta), (width)) +#define __BALLOT_SYNC(mask, predicate) __ballot((predicate)) +#define __ACTIVEMASK() __ballot(1) +#endif + +// Data sharing related quantities, need to match what is used in the compiler. +enum DATA_SHARING_SIZES { + // The maximum number of workers in a kernel. + DS_Max_Worker_Threads = 992, + // The size reserved for data in a shared memory slot. + DS_Slot_Size = 256, + // The slot size that should be reserved for a working warp. + DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, + // The maximum number of warps in use + DS_Max_Warp_Number = 32, +}; + +// Data structure to keep in shared memory that traces the current slot, stack, +// and frame pointer as well as the active threads that didn't exit the current +// environment. +struct DataSharingStateTy { + __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number]; + void *StackPtr[DS_Max_Warp_Number]; + void *FramePtr[DS_Max_Warp_Number]; + int32_t ActiveThreads[DS_Max_Warp_Number]; +}; +// Additional worker slot type which is initialized with the default worker slot +// size of 4*32 bytes. +struct __kmpc_data_sharing_worker_slot_static { + __kmpc_data_sharing_slot *Next; + void *DataEnd; + char Data[DS_Worker_Warp_Slot_Size]; +}; +// Additional master slot type which is initialized with the default master slot +// size of 4 bytes. 
+struct __kmpc_data_sharing_master_slot_static { + __kmpc_data_sharing_slot *Next; + void *DataEnd; + char Data[DS_Slot_Size]; +}; +extern __device__ __shared__ DataSharingStateTy DataSharingState; + +//////////////////////////////////////////////////////////////////////////////// +// task ICV and (implicit & explicit) task state + +class omptarget_nvptx_TaskDescr { +public: + // methods for flags + INLINE omp_sched_t GetRuntimeSched(); + INLINE void SetRuntimeSched(omp_sched_t sched); + INLINE int IsDynamic() { return data.items.flags & TaskDescr_IsDynamic; } + INLINE void SetDynamic() { + data.items.flags = data.items.flags | TaskDescr_IsDynamic; + } + INLINE void ClearDynamic() { + data.items.flags = data.items.flags & (~TaskDescr_IsDynamic); + } + INLINE int InParallelRegion() { return data.items.flags & TaskDescr_InPar; } + INLINE int InL2OrHigherParallelRegion() { + return data.items.flags & TaskDescr_InParL2P; + } + INLINE int IsParallelConstruct() { + return data.items.flags & TaskDescr_IsParConstr; + } + INLINE int IsTaskConstruct() { return !IsParallelConstruct(); } + // methods for other fields + INLINE uint16_t &NThreads() { return data.items.nthreads; } + INLINE uint16_t &ThreadLimit() { return data.items.threadlimit; } + INLINE uint16_t &ThreadId() { return data.items.threadId; } + INLINE uint16_t &ThreadsInTeam() { return data.items.threadsInTeam; } + INLINE uint64_t &RuntimeChunkSize() { return data.items.runtimeChunkSize; } + INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() { return prev; } + INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) { + prev = taskDescr; + } + // init & copy + INLINE void InitLevelZeroTaskDescr(); + INLINE void InitLevelOneTaskDescr(uint16_t tnum, + omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr); + INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr); + INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr, + uint16_t tnum); + INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); + INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr, + uint16_t tid, uint16_t tnum); + +private: + // bits for flags: (7 used, 1 free) + // 3 bits (SchedMask) for runtime schedule + // 1 bit (IsDynamic) for dynamic schedule (false = static) + // 1 bit (InPar) if this thread has encountered one or more parallel region + // 1 bit (IsParConstr) if ICV for a parallel region (false = explicit task) + // 1 bit (InParL2+) if this thread has encountered L2 or higher parallel + // region + static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4); + static const uint8_t TaskDescr_IsDynamic = 0x8; + static const uint8_t TaskDescr_InPar = 0x10; + static const uint8_t TaskDescr_IsParConstr = 0x20; + static const uint8_t TaskDescr_InParL2P = 0x40; + + union { // both have same size + uint64_t vect[2]; + struct TaskDescr_items { + uint8_t flags; // 6 bit used (see flag above) + uint8_t unused; + uint16_t nthreads; // thread num for subsequent parallel regions + uint16_t threadlimit; // thread limit ICV + uint16_t threadId; // thread id + uint16_t threadsInTeam; // threads in current team + uint64_t runtimeChunkSize; // runtime chunk size + } items; + } data; + omptarget_nvptx_TaskDescr *prev; +}; + +// build on kmp +typedef struct omptarget_nvptx_ExplicitTaskDescr { + 
omptarget_nvptx_TaskDescr + taskDescr; // omptarget_nvptx task description (must be first) + kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last) +} omptarget_nvptx_ExplicitTaskDescr; + +//////////////////////////////////////////////////////////////////////////////// +// Descriptor of a parallel region (worksharing in general) + +class omptarget_nvptx_WorkDescr { + +public: + // access to data + INLINE omptarget_nvptx_CounterGroup &CounterGroup() { return cg; } + INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; } + // init + INLINE void InitWorkDescr(); + +private: + omptarget_nvptx_CounterGroup cg; // for barrier (no other needed) + omptarget_nvptx_TaskDescr masterTaskICV; + bool hasCancel; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class omptarget_nvptx_TeamDescr { +public: + // access to data + INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() { + return &levelZeroTaskDescr; + } + INLINE omptarget_nvptx_WorkDescr &WorkDescr() { + return workDescrForActiveParallel; + } + INLINE omp_lock_t *CriticalLock() { return &criticalLock; } + INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; } + + // init + INLINE void InitTeamDescr(); + + INLINE __kmpc_data_sharing_slot *RootS(int wid) { + // If this is invoked by the master thread of the master warp then intialize + // it with a smaller slot. + if (wid == WARPSIZE - 1) { + // Initialize the pointer to the end of the slot given the size of the + // data section. DataEnd is non-inclusive. + master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size; + // We currently do not have a next slot. + master_rootS[0].Next = 0; + return (__kmpc_data_sharing_slot *)&master_rootS[0]; + } + // Initialize the pointer to the end of the slot given the size of the data + // section. DataEnd is non-inclusive. + worker_rootS[wid].DataEnd = + &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size; + // We currently do not have a next slot. 
+ worker_rootS[wid].Next = 0; + return (__kmpc_data_sharing_slot *)&worker_rootS[wid]; + } + +private: + omptarget_nvptx_TaskDescr + levelZeroTaskDescr; // icv for team master initial thread + omptarget_nvptx_WorkDescr + workDescrForActiveParallel; // one, ONLY for the active par + omp_lock_t criticalLock; + uint64_t lastprivateIterBuffer; + + __align__(16) + __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE - 1]; + __align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1]; +}; + +//////////////////////////////////////////////////////////////////////////////// +// thread private data (struct of arrays for better coalescing) +// tid refers here to the global thread id +// do not support multiple concurrent kernel a this time +class omptarget_nvptx_ThreadPrivateContext { +public: + // task + INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) { + return &levelOneTaskDescr[tid]; + } + INLINE void SetTopLevelTaskDescr(int tid, + omptarget_nvptx_TaskDescr *taskICV) { + topTaskDescr[tid] = taskICV; + } + INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid); + // parallel + INLINE uint16_t &NumThreadsForNextParallel(int tid) { + return nextRegion.tnum[tid]; + } + // simd + INLINE uint16_t &SimdLimitForNextSimd(int tid) { + return nextRegion.slim[tid]; + } + // sync + INLINE Counter &Priv(int tid) { return priv[tid]; } + INLINE void IncrementPriv(int tid, Counter val) { priv[tid] += val; } + // schedule (for dispatch) + INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; } + INLINE int64_t &Chunk(int tid) { return chunk[tid]; } + INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; } + // state for dispatch with dyn/guided + INLINE Counter &CurrentEvent(int tid) { + return currEvent_or_nextLowerBound[tid]; + } + INLINE Counter &EventsNumber(int tid) { return eventsNum_or_stride[tid]; } + // state for dispatch with static + INLINE Counter &NextLowerBound(int tid) { + return currEvent_or_nextLowerBound[tid]; + } + INLINE Counter &Stride(int tid) { return eventsNum_or_stride[tid]; } + + INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; } + + INLINE void InitThreadPrivateContext(int tid); + INLINE void SetSourceQueue(uint64_t Src) { SourceQueue = Src; } + INLINE uint64_t GetSourceQueue() { return SourceQueue; } + +private: + // team context for this team + omptarget_nvptx_TeamDescr teamContext; + // task ICV for implict threads in the only parallel region + omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM]; + // pointer where to find the current task ICV (top of the stack) + omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM]; + union { + // Only one of the two is live at the same time. + // parallel + uint16_t tnum[MAX_THREADS_PER_TEAM]; + // simd limit + uint16_t slim[MAX_THREADS_PER_TEAM]; + } nextRegion; + // sync + Counter priv[MAX_THREADS_PER_TEAM]; + // schedule (for dispatch) + kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for + int64_t chunk[MAX_THREADS_PER_TEAM]; + int64_t loopUpperBound[MAX_THREADS_PER_TEAM]; + // state for dispatch with dyn/guided OR static (never use both at a time) + Counter currEvent_or_nextLowerBound[MAX_THREADS_PER_TEAM]; + Counter eventsNum_or_stride[MAX_THREADS_PER_TEAM]; + // Queue to which this object must be returned. 
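+  // (On sm_70 and newer the slot index is recorded here via SetSourceQueue()
+  // in __kmpc_kernel_init() and read back in __kmpc_kernel_deinit(), so the
+  // state object is returned to the same per-SM pool it was dequeued from.)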
+ uint64_t SourceQueue; +}; + +//////////////////////////////////////////////////////////////////////////////// +// global data tables +//////////////////////////////////////////////////////////////////////////////// + +extern __device__ __shared__ + omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; +extern __device__ __shared__ uint32_t execution_param; +extern __device__ __shared__ void *ReductionScratchpadPtr; + +//////////////////////////////////////////////////////////////////////////////// +// work function (outlined parallel/simd functions) and arguments. +// needed for L1 parallelism only. +//////////////////////////////////////////////////////////////////////////////// + +typedef void *omptarget_nvptx_WorkFn; +extern volatile __device__ __shared__ omptarget_nvptx_WorkFn + omptarget_nvptx_workFn; + +//////////////////////////////////////////////////////////////////////////////// +// get private data structures +//////////////////////////////////////////////////////////////////////////////// + +INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor(); +INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor(); +INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(); +INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId); + +//////////////////////////////////////////////////////////////////////////////// +// inlined implementation +//////////////////////////////////////////////////////////////////////////////// + +#include "counter_groupi.h" +#include "omptarget-nvptxi.h" +#include "supporti.h" + +#endif Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -0,0 +1,188 @@ +//===--- omptarget-nvptx.cu - NVPTX OpenMP GPU initialization ---- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the initialization code for the GPU +// +//===----------------------------------------------------------------------===// + +#include "omptarget-nvptx.h" + +//////////////////////////////////////////////////////////////////////////////// +// global data tables +//////////////////////////////////////////////////////////////////////////////// + +extern __device__ + omptarget_nvptx_Queue + omptarget_nvptx_device_State[MAX_SM]; + +extern __device__ __shared__ + omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; + +// +// The team master sets the outlined function and its arguments in these +// variables to communicate with the workers. Since they are in shared memory, +// there is one copy of these variables for each kernel, instance, and team. 
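+// The producer/consumer sides of this handshake are
+// __kmpc_kernel_prepare_parallel() (the team master stores the outlined
+// function into omptarget_nvptx_workFn) and __kmpc_kernel_parallel() (the
+// workers read it back); see parallel.cu.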
+// +extern volatile __device__ __shared__ omptarget_nvptx_WorkFn + omptarget_nvptx_workFn; +extern __device__ __shared__ uint32_t execution_param; + +//////////////////////////////////////////////////////////////////////////////// +// init entry points +//////////////////////////////////////////////////////////////////////////////// + +INLINE unsigned smid() { + unsigned id; + asm("mov.u32 %0, %%smid;" : "=r"(id)); + return id; +} + +EXTERN void __kmpc_kernel_init_params(void *Ptr) { + PRINT(LD_IO, "call to __kmpc_kernel_init_params with version %f\n", + OMPTARGET_NVPTX_VERSION); + + SetTeamsReductionScratchpadPtr(Ptr); +} + +EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) { + PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n", + OMPTARGET_NVPTX_VERSION); + + if (!RequiresOMPRuntime) { + // If OMP runtime is not required don't initialize OMP state. + setExecutionParameters(Generic, RuntimeUninitialized); + return; + } + setExecutionParameters(Generic, RuntimeInitialized); + + int threadIdInBlock = GetThreadIdInBlock(); + ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(), + "__kmpc_kernel_init() must be called by team master warp only!"); + PRINT0(LD_IO, "call to __kmpc_kernel_init for master\n"); + + // Get a state object from the queue. + int slot = smid() % MAX_SM; + omptarget_nvptx_threadPrivateContext = + omptarget_nvptx_device_State[slot].Dequeue(); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + omptarget_nvptx_threadPrivateContext->SetSourceQueue(slot); +#endif + + // init thread private + int threadId = GetLogicalThreadIdInBlock(); + omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(threadId); + + // init team context + omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); + currTeamDescr.InitTeamDescr(); + // this thread will start execution... has to update its task ICV + // to point to the level zero task ICV. That ICV was init in + // InitTeamDescr() + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + threadId, currTeamDescr.LevelZeroTaskDescr()); + + // set number of threads and thread limit in team to started value + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + currTaskDescr->NThreads() = GetNumberOfWorkersInTeam(); + currTaskDescr->ThreadLimit() = ThreadLimit; +} + +EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) { + if (IsOMPRuntimeInitialized) { + // Enqueue omp state object for use by another team. +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + int slot = omptarget_nvptx_threadPrivateContext->GetSourceQueue(); +#else + int slot = smid() % MAX_SM; +#endif + omptarget_nvptx_device_State[slot].Enqueue( + omptarget_nvptx_threadPrivateContext); + } + // Done with work. Kill the workers. + omptarget_nvptx_workFn = 0; +} + +EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, + int16_t RequiresDataSharing) { + PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n"); + + if (!RequiresOMPRuntime) { + // If OMP runtime is not required don't initialize OMP state. + setExecutionParameters(Spmd, RuntimeUninitialized); + return; + } + setExecutionParameters(Spmd, RuntimeInitialized); + + // + // Team Context Initialization. + // + // In SPMD mode there is no master thread so use any cuda thread for team + // context initialization. + int threadId = GetThreadIdInBlock(); + if (threadId == 0) { + // Get a state object from the queue. 
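+    // (Same per-SM pooling as in the generic-mode __kmpc_kernel_init() above:
+    // the slot is simply smid() % MAX_SM.)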
+ int slot = smid() % MAX_SM; + omptarget_nvptx_threadPrivateContext = + omptarget_nvptx_device_State[slot].Dequeue(); + + omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); + omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); + // init team context + currTeamDescr.InitTeamDescr(); + // init counters (copy start to init) + workDescr.CounterGroup().Reset(); + } + __syncthreads(); + + omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); + omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); + + // + // Initialize task descr for each thread. + // + omptarget_nvptx_TaskDescr *newTaskDescr = + omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); + ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); + newTaskDescr->InitLevelOneTaskDescr(ThreadLimit, + currTeamDescr.LevelZeroTaskDescr()); + newTaskDescr->ThreadLimit() = ThreadLimit; + // install new top descriptor + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, + newTaskDescr); + + // init thread private from init value + workDescr.CounterGroup().Init( + omptarget_nvptx_threadPrivateContext->Priv(threadId)); + PRINT(LD_PAR, + "thread will execute parallel region with id %d in a team of " + "%d threads\n", + newTaskDescr->ThreadId(), newTaskDescr->NThreads()); + + if (RequiresDataSharing && threadId % WARPSIZE == 0) { + // Warp master innitializes data sharing environment. + unsigned WID = threadId / WARPSIZE; + __kmpc_data_sharing_slot *RootS = currTeamDescr.RootS(WID); + DataSharingState.SlotPtr[WID] = RootS; + DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; + } +} + +EXTERN void __kmpc_spmd_kernel_deinit() { + // We're not going to pop the task descr stack of each thread since + // there are no more parallel regions in SPMD mode. + __syncthreads(); + int threadId = GetThreadIdInBlock(); + if (threadId == 0) { + // Enqueue omp state object for use by another team. + int slot = smid() % MAX_SM; + omptarget_nvptx_device_State[slot].Enqueue( + omptarget_nvptx_threadPrivateContext); + } +} Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h @@ -0,0 +1,195 @@ +//===---- omptarget-nvptxi.h - NVPTX OpenMP GPU initialization --- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of all library macros, types, +// and functions. 
+// +//===----------------------------------------------------------------------===// + +//////////////////////////////////////////////////////////////////////////////// +// Task Descriptor +//////////////////////////////////////////////////////////////////////////////// + +INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() { + // sched starts from 1..4; encode it as 0..3; so add 1 here + uint8_t rc = (data.items.flags & TaskDescr_SchedMask) + 1; + return (omp_sched_t)rc; +} + +INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) { + // sched starts from 1..4; encode it as 0..3; so sub 1 here + uint8_t val = ((uint8_t)sched) - 1; + // clear current sched + data.items.flags &= ~TaskDescr_SchedMask; + // set new sched + data.items.flags |= val; +} + +INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() { + // slow method + // flag: + // default sched is static, + // dyn is off (unused now anyway, but may need to sample from host?) + // not in parallel + + data.items.flags = 0; + data.items.nthreads = GetNumberOfProcsInTeam(); + ; // threads: whatever was allocated by the kernel + data.items.threadId = 0; // is master + data.items.threadsInTeam = 1; // sequential + data.items.runtimeChunkSize = 1; // preferred chunking: static with chunk 1 +} + +// This is called when all threads are started together in SPMD mode. +// OMP directives include target parallel, target distribute parallel for, etc. +INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr( + uint16_t tnum, omptarget_nvptx_TaskDescr *parentTaskDescr) { + // slow method + // flag: + // default sched is static, + // dyn is off (unused now anyway, but may need to sample from host?) + // in L1 parallel + + data.items.flags = + TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel + data.items.nthreads = 0; // # threads for subsequent parallel region + data.items.threadId = + GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) + data.items.threadsInTeam = tnum; + data.items.runtimeChunkSize = 1; // preferred chunking: static with chunk 1 + prev = parentTaskDescr; +} + +INLINE void omptarget_nvptx_TaskDescr::CopyData( + omptarget_nvptx_TaskDescr *sourceTaskDescr) { + data.vect[0] = sourceTaskDescr->data.vect[0]; + data.vect[1] = sourceTaskDescr->data.vect[1]; +} + +INLINE void +omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) { + CopyData(sourceTaskDescr); + prev = sourceTaskDescr->prev; +} + +INLINE void omptarget_nvptx_TaskDescr::CopyParent( + omptarget_nvptx_TaskDescr *parentTaskDescr) { + CopyData(parentTaskDescr); + prev = parentTaskDescr; +} + +INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask( + omptarget_nvptx_TaskDescr *parentTaskDescr) { + CopyParent(parentTaskDescr); + data.items.flags = data.items.flags & ~TaskDescr_IsParConstr; + ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task"); +} + +INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr( + omptarget_nvptx_TaskDescr *masterTaskDescr, uint16_t tnum) { + CopyParent(masterTaskDescr); + // overwrite specific items; + data.items.flags |= + TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel + data.items.threadsInTeam = tnum; // set number of threads +} + +INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr( + omptarget_nvptx_TaskDescr *workTaskDescr) { + Copy(workTaskDescr); + // + // overwrite specific items; + // + // The threadID should be GetThreadIdInBlock() % GetMasterThreadID().
+ // This is so that the serial master (first lane in the master warp) + // gets a threadId of 0. + // However, we know that this function is always called in a parallel + // region where only workers are active. The serial master thread + // never enters this region. When a parallel region is executed serially, + // the threadId is set to 0 elsewhere and the kmpc_serialized_* functions + // are called, which never activate this region. + data.items.threadId = + GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) +} + +INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent( + omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) { + CopyParent(parentTaskDescr); + data.items.flags |= TaskDescr_InParL2P; // In L2+ parallelism + data.items.threadsInTeam = tnum; // set number of threads + data.items.threadId = tid; +} + +//////////////////////////////////////////////////////////////////////////////// +// Thread Private Context +//////////////////////////////////////////////////////////////////////////////// + +INLINE omptarget_nvptx_TaskDescr * +omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) { + ASSERT0( + LT_FUSSY, tid < MAX_THREADS_PER_TEAM, + "Getting top level, tid is larger than allocated data structure size"); + return topTaskDescr[tid]; +} + +INLINE void +omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) { + // levelOneTaskDescr is init when starting the parallel region + // top task descr is NULL (team master version will be fixed separately) + topTaskDescr[tid] = NULL; + // no num threads value has been pushed + nextRegion.tnum[tid] = 0; + // priv counter init to zero + priv[tid] = 0; + // the following don't need to be init here; they are init when using dyn + // sched + // current_Event, events_Number, chunk, num_Iterations, schedule +} + +//////////////////////////////////////////////////////////////////////////////// +// Work Descriptor +//////////////////////////////////////////////////////////////////////////////// + +INLINE void omptarget_nvptx_WorkDescr::InitWorkDescr() { + cg.Clear(); // start and stop to zero too + // threadsInParallelTeam does not need to be init (done in start parallel) + hasCancel = FALSE; +} + +//////////////////////////////////////////////////////////////////////////////// +// Team Descriptor +//////////////////////////////////////////////////////////////////////////////// + +INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() { + levelZeroTaskDescr.InitLevelZeroTaskDescr(); + workDescrForActiveParallel.InitWorkDescr(); + // omp_init_lock(criticalLock); +} + +//////////////////////////////////////////////////////////////////////////////// +// Get private data structure for thread +//////////////////////////////////////////////////////////////////////////////// + +// Utility routines for CUDA threads +INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() { + return omptarget_nvptx_threadPrivateContext->TeamContext(); +} + +INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() { + omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); + return currTeamDescr.WorkDescr(); +} + +INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) { + return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); +} + +INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor() { + return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock()); +} Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/option.h 
=================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/option.h +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/option.h @@ -0,0 +1,66 @@ +//===------------ option.h - NVPTX OpenMP GPU options ------------ CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// GPU default options +// +//===----------------------------------------------------------------------===// +#ifndef _OPTION_H_ +#define _OPTION_H_ + +//////////////////////////////////////////////////////////////////////////////// +// Kernel options +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// The following def must match the absolute limit hardwired in the host RTL +// max number of threads per team +#define MAX_THREADS_PER_TEAM 1024 + +#define WARPSIZE 32 + +// The named barrier for active parallel threads of a team in an L1 parallel +// region to synchronize with each other. +#define L1_BARRIER (1) + +// Maximum number of omp state objects per SM allocated statically in global +// memory. +#if __CUDA_ARCH__ >= 600 +#define OMP_STATE_COUNT 32 +#define MAX_SM 56 +#else +#define OMP_STATE_COUNT 16 +#define MAX_SM 16 +#endif + +//////////////////////////////////////////////////////////////////////////////// +// algo options +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// data options +//////////////////////////////////////////////////////////////////////////////// + +// decide if counters are 32 or 64 bit +#define Counter unsigned long long + +//////////////////////////////////////////////////////////////////////////////// +// misc options (by default everything here is device code) +//////////////////////////////////////////////////////////////////////////////// + +#define EXTERN extern "C" __device__ +#define INLINE __inline__ __device__ +#define NOINLINE __noinline__ __device__ +#ifndef TRUE +#define TRUE 1 +#endif +#ifndef FALSE +#define FALSE 0 +#endif + +#endif Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -0,0 +1,476 @@ +//===---- parallel.cu - NVPTX OpenMP parallel implementation ----- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Parallel implementation in the GPU.
Here is the pattern: +// +// while (not finished) { +// +// if (master) { +// sequential code, decide which par loop to do, or if finished +// __kmpc_kernel_prepare_parallel() // exec by master only +// } +// syncthreads // A +// __kmpc_kernel_parallel() // exec by all +// if (this thread is included in the parallel) { +// switch () for all parallel loops +// __kmpc_kernel_end_parallel() // exec only by threads in parallel +// } +// +// +// The reason we don't exec end_parallel for the threads not included +// in the parallel loop is that for each barrier in the parallel +// region, these non-included threads will cycle through the +// syncthread A. Thus they must preserve their current threadId that +// is larger than thread in team. +// +// To make a long story short... +// +//===----------------------------------------------------------------------===// + +#include "omptarget-nvptx.h" + +typedef struct ConvergentSimdJob { + omptarget_nvptx_TaskDescr taskDescr; + omptarget_nvptx_TaskDescr *convHeadTaskDescr; + uint16_t slimForNextSimd; +} ConvergentSimdJob; + +//////////////////////////////////////////////////////////////////////////////// +// support for convergent simd (team of threads in a warp only) +//////////////////////////////////////////////////////////////////////////////// +EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask, + bool *IsFinal, int32_t *LaneSource, + int32_t *LaneId, int32_t *NumLanes) { + PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n"); + uint32_t ConvergentMask = Mask; + int32_t ConvergentSize = __popc(ConvergentMask); + uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1); + *LaneSource += __ffs(WorkRemaining); + *IsFinal = __popc(WorkRemaining) == 1; + uint32_t lanemask_lt; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt)); + *LaneId = __popc(ConvergentMask & lanemask_lt); + + int threadId = GetLogicalThreadIdInBlock(); + int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource; + + ConvergentSimdJob *job = (ConvergentSimdJob *)buffer; + int32_t SimdLimit = + omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId); + job->slimForNextSimd = SimdLimit; + + int32_t SimdLimitSource = __SHFL_SYNC(Mask, SimdLimit, *LaneSource); + // reset simdlimit to avoid propagating to successive #simd + if (SimdLimitSource > 0 && threadId == sourceThreadId) + omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0; + + // We cannot have more than the # of convergent threads. + if (SimdLimitSource > 0) + *NumLanes = min(ConvergentSize, SimdLimitSource); + else + *NumLanes = ConvergentSize; + ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads", + *NumLanes); + + // Set to true for lanes participating in the simd region. + bool isActive = false; + // Initialize state for active threads. + if (*LaneId < *NumLanes) { + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + omptarget_nvptx_TaskDescr *sourceTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr( + sourceThreadId); + job->convHeadTaskDescr = currTaskDescr; + // install top descriptor from the thread for which the lanes are working. 
+ omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, + sourceTaskDescr); + isActive = true; + } + + // requires a memory fence between threads of a warp + return isActive; +} + +EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) { + PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n"); + // pop stack + int threadId = GetLogicalThreadIdInBlock(); + ConvergentSimdJob *job = (ConvergentSimdJob *)buffer; + omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = + job->slimForNextSimd; + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + threadId, job->convHeadTaskDescr); +} + +typedef struct ConvergentParallelJob { + omptarget_nvptx_TaskDescr taskDescr; + omptarget_nvptx_TaskDescr *convHeadTaskDescr; + uint16_t tnumForNextPar; +} ConvergentParallelJob; + +//////////////////////////////////////////////////////////////////////////////// +// support for convergent parallelism (team of threads in a warp only) +//////////////////////////////////////////////////////////////////////////////// +EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask, + bool *IsFinal, + int32_t *LaneSource) { + PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n"); + uint32_t ConvergentMask = Mask; + int32_t ConvergentSize = __popc(ConvergentMask); + uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1); + *LaneSource += __ffs(WorkRemaining); + *IsFinal = __popc(WorkRemaining) == 1; + uint32_t lanemask_lt; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt)); + uint32_t OmpId = __popc(ConvergentMask & lanemask_lt); + + int threadId = GetLogicalThreadIdInBlock(); + int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource; + + ConvergentParallelJob *job = (ConvergentParallelJob *)buffer; + int32_t NumThreadsClause = + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId); + job->tnumForNextPar = NumThreadsClause; + + int32_t NumThreadsSource = __SHFL_SYNC(Mask, NumThreadsClause, *LaneSource); + // reset numthreads to avoid propagating to successive #parallel + if (NumThreadsSource > 0 && threadId == sourceThreadId) + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) = + 0; + + // We cannot have more than the # of convergent threads. + uint16_t NumThreads; + if (NumThreadsSource > 0) + NumThreads = min(ConvergentSize, NumThreadsSource); + else + NumThreads = ConvergentSize; + ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads", + NumThreads); + + // Set to true for workers participating in the parallel region. + bool isActive = false; + // Initialize state for active threads. + if (OmpId < NumThreads) { + // init L2 task descriptor and storage for the L1 parallel task descriptor. 
+ omptarget_nvptx_TaskDescr *newTaskDescr = &job->taskDescr; + ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + omptarget_nvptx_TaskDescr *sourceTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr( + sourceThreadId); + job->convHeadTaskDescr = currTaskDescr; + newTaskDescr->CopyConvergentParent(sourceTaskDescr, OmpId, NumThreads); + // install new top descriptor + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, + newTaskDescr); + isActive = true; + } + + // requires a memory fence between threads of a warp + return isActive; +} + +EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) { + PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n"); + // pop stack + int threadId = GetLogicalThreadIdInBlock(); + ConvergentParallelJob *job = (ConvergentParallelJob *)buffer; + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + threadId, job->convHeadTaskDescr); + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) = + job->tnumForNextPar; +} + +//////////////////////////////////////////////////////////////////////////////// +// support for parallel that goes parallel (1 static level only) +//////////////////////////////////////////////////////////////////////////////// + +// return number of cuda threads that participate to parallel +// calculation has to consider simd implementation in nvptx +// i.e. (num omp threads * num lanes) +// +// cudathreads = +// if(num_threads != 0) { +// if(thread_limit > 0) { +// min (num_threads*numLanes ; thread_limit*numLanes); +// } else { +// min (num_threads*numLanes; blockDim.x) +// } +// } else { +// if (thread_limit != 0) { +// min (thread_limit*numLanes; blockDim.x) +// } else { // no thread_limit, no num_threads, use all cuda threads +// blockDim.x; +// } +// } +// +// This routine is always called by the team master.. +EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, + int16_t IsOMPRuntimeInitialized) { + PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n"); + omptarget_nvptx_workFn = WorkFn; + + if (!IsOMPRuntimeInitialized) + return; + + // This routine is only called by the team master. The team master is + // the first thread of the last warp. It always has the logical thread + // id of 0 (since it is a shadow for the first worker thread). + int threadId = 0; + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); + ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(), + "cannot be called in a parallel region."); + if (currTaskDescr->InParallelRegion()) { + PRINT0(LD_PAR, "already in parallel: go seq\n"); + return; + } + + uint16_t CudaThreadsForParallel = 0; + uint16_t NumThreadsClause = + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId); + + // we cannot have more than block size + uint16_t CudaThreadsAvail = GetNumberOfWorkersInTeam(); + + // currTaskDescr->ThreadLimit(): If non-zero, this is the limit as + // specified by the thread_limit clause on the target directive. + // GetNumberOfWorkersInTeam(): This is the number of workers available + // in this kernel instance. + // + // E.g: If thread_limit is 33, the kernel is launched with 33+32=65 + // threads. The last warp is the master warp so in this case + // GetNumberOfWorkersInTeam() returns 64. 
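+  // Continuing that example with no num_threads clause: the code below selects
+  // min(thread_limit, CudaThreadsAvail) = min(33, 64) = 33 CUDA threads, which
+  // on sm_70 and newer is then rounded down to a multiple of WARPSIZE (32).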
+ + // this is different from ThreadAvail of OpenMP because we may be + // using some of the CUDA threads as SIMD lanes + int NumLanes = 1; + if (NumThreadsClause != 0) { + // reset request to avoid propagating to successive #parallel + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) = + 0; + + // assume that thread_limit*numlanes is already <= CudaThreadsAvail + // because that is already checked on the host side (CUDA offloading rtl) + if (currTaskDescr->ThreadLimit() != 0) + CudaThreadsForParallel = + NumThreadsClause * NumLanes < currTaskDescr->ThreadLimit() * NumLanes + ? NumThreadsClause * NumLanes + : currTaskDescr->ThreadLimit() * NumLanes; + else { + CudaThreadsForParallel = (NumThreadsClause * NumLanes > CudaThreadsAvail) + ? CudaThreadsAvail + : NumThreadsClause * NumLanes; + } + } else { + if (currTaskDescr->ThreadLimit() != 0) { + CudaThreadsForParallel = + (currTaskDescr->ThreadLimit() * NumLanes > CudaThreadsAvail) + ? CudaThreadsAvail + : currTaskDescr->ThreadLimit() * NumLanes; + } else + CudaThreadsForParallel = CudaThreadsAvail; + } + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + // On Volta and newer architectures we require that all lanes in + // a warp participate in the parallel region. Round down to a + // multiple of WARPSIZE since it is legal to do so in OpenMP. + // CudaThreadsAvail is the number of workers available in this + // kernel instance and is greater than or equal to + // currTaskDescr->ThreadLimit(). + if (CudaThreadsForParallel < CudaThreadsAvail) { + CudaThreadsForParallel = + (CudaThreadsForParallel < WARPSIZE) + ? 1 + : CudaThreadsForParallel & ~((uint16_t)WARPSIZE - 1); + } +#endif + + ASSERT(LT_FUSSY, CudaThreadsForParallel > 0, + "bad thread request of %d threads", CudaThreadsForParallel); + ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), + "only team master can create parallel"); + + // set number of threads on work descriptor + // this is different from the number of cuda threads required for the parallel + // region + omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); + workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr, + CudaThreadsForParallel / NumLanes); + // init counters (copy start to init) + workDescr.CounterGroup().Reset(); +} + +// All workers call this function. Deactivate those not needed. +// Fn - the outlined work function to execute. +// returns True if this thread is active, else False. +// +// Only the worker threads call this routine. +EXTERN bool __kmpc_kernel_parallel(void **WorkFn, + int16_t IsOMPRuntimeInitialized) { + PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n"); + + // Work function and arguments for L1 parallel region. + *WorkFn = omptarget_nvptx_workFn; + + if (!IsOMPRuntimeInitialized) + return true; + + // If this is the termination signal from the master, quit early. + if (!*WorkFn) + return false; + + // Only the worker threads call this routine and the master warp + // never arrives here. Therefore, use the nvptx thread id. + int threadId = GetThreadIdInBlock(); + omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); + // Set to true for workers participating in the parallel region. + bool isActive = false; + // Initialize state for active threads. 
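+  // A worker participates iff its thread id is below ThreadsInTeam, which the
+  // team master set via CopyToWorkDescr() in __kmpc_kernel_prepare_parallel().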
+ if (threadId < workDescr.WorkTaskDescr()->ThreadsInTeam()) { + // init work descriptor from workdesccr + omptarget_nvptx_TaskDescr *newTaskDescr = + omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); + ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); + newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr()); + // install new top descriptor + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, + newTaskDescr); + // init private from int value + workDescr.CounterGroup().Init( + omptarget_nvptx_threadPrivateContext->Priv(threadId)); + PRINT(LD_PAR, + "thread will execute parallel region with id %d in a team of " + "%d threads\n", + newTaskDescr->ThreadId(), newTaskDescr->NThreads()); + + isActive = true; + } + + return isActive; +} + +EXTERN void __kmpc_kernel_end_parallel() { + // pop stack + PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n"); + // Only the worker threads call this routine and the master warp + // never arrives here. Therefore, use the nvptx thread id. + int threadId = GetThreadIdInBlock(); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + threadId, currTaskDescr->GetPrevTaskDescr()); +} + +//////////////////////////////////////////////////////////////////////////////// +// support for parallel that goes sequential +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n"); + + // assume this is only called for nested parallel + int threadId = GetLogicalThreadIdInBlock(); + + // unlike actual parallel, threads in the same team do not share + // the workTaskDescr in this case and num threads is fixed to 1 + + // get current task + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); + + // allocate new task descriptor and copy value from current one, set prev to + // it + omptarget_nvptx_TaskDescr *newTaskDescr = + (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr), + (char *)"new seq parallel task"); + newTaskDescr->CopyParent(currTaskDescr); + + // tweak values for serialized parallel case: + // - each thread becomes ID 0 in its serialized parallel, and + // - there is only one thread per team + newTaskDescr->ThreadId() = 0; + newTaskDescr->ThreadsInTeam() = 1; + + // set new task descriptor as top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, + newTaskDescr); +} + +EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc, + uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n"); + + // pop stack + int threadId = GetLogicalThreadIdInBlock(); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); + // set new top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + threadId, currTaskDescr->GetPrevTaskDescr()); + // free + SafeFree(currTaskDescr, (char *)"new seq parallel task"); +} + +EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_parallel_level\n"); + + int threadId = GetLogicalThreadIdInBlock(); + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + if (currTaskDescr->InL2OrHigherParallelRegion()) + return 2; + else if (currTaskDescr->InParallelRegion()) + return 1; + else + return 0; +} + +// This kmpc call 
returns the thread id across all teams. Its value is
+// cached by the compiler and used when calling the runtime. On nvptx
+// it's cheap to recalculate this value so we never use the result
+// of this call.
+EXTERN int32_t __kmpc_global_thread_num(kmp_Indent *loc) {
+  return GetLogicalThreadIdInBlock();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// push params
+////////////////////////////////////////////////////////////////////////////////
+
+EXTERN void __kmpc_push_num_threads(kmp_Indent *loc, int32_t tid,
+                                    int32_t num_threads) {
+  PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
+  tid = GetLogicalThreadIdInBlock();
+  omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
+      num_threads;
+}
+
+EXTERN void __kmpc_push_simd_limit(kmp_Indent *loc, int32_t tid,
+                                   int32_t simd_limit) {
+  PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", simd_limit);
+  tid = GetLogicalThreadIdInBlock();
+  omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit;
+}
+
+// Do nothing. The host guarantees we started the requested number of
+// teams and we only need to inspect gridDim.
+
+EXTERN void __kmpc_push_num_teams(kmp_Indent *loc, int32_t tid,
+                                  int32_t num_teams, int32_t thread_limit) {
+  PRINT(LD_IO, "call kmpc_push_num_teams %d\n", num_teams);
+  ASSERT0(LT_FUSSY, FALSE,
+          "should never have anything with new teams on device");
+}
+
+EXTERN void __kmpc_push_proc_bind(kmp_Indent *loc, uint32_t tid,
+                                  int proc_bind) {
+  PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", proc_bind);
+}
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -0,0 +1,443 @@
+//===---- reduction.cu - NVPTX OpenMP reduction implementation ---- CUDA
+//-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of reduction with KMPC interface.
+// +//===----------------------------------------------------------------------===// + +#include +#include + +#include "omptarget-nvptx.h" + +// may eventually remove this +EXTERN +int32_t __gpu_block_reduce() { + int tid = GetLogicalThreadIdInBlock(); + int nt = GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()); + if (nt != blockDim.x) + return 0; + unsigned tnum = __ACTIVEMASK(); + if (tnum != (~0x0)) { // assume swapSize is 32 + return 0; + } + return 1; +} + +EXTERN +int32_t __kmpc_reduce_gpu(kmp_Indent *loc, int32_t global_tid, int32_t num_vars, + size_t reduce_size, void *reduce_data, + void *reduce_array_size, kmp_ReductFctPtr *reductFct, + kmp_CriticalName *lck) { + int threadId = GetLogicalThreadIdInBlock(); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); + int numthread; + if (currTaskDescr->IsParallelConstruct()) { + numthread = + GetNumberOfOmpThreads(threadId, isSPMDMode(), isRuntimeUninitialized()); + } else { + numthread = GetNumberOfOmpTeams(); + } + + if (numthread == 1) + return 1; + else if (!__gpu_block_reduce()) + return 2; + else { + if (threadIdx.x == 0) + return 1; + else + return 0; + } +} + +EXTERN +int32_t __kmpc_reduce_combined(kmp_Indent *loc) { + if (threadIdx.x == 0) { + return 2; + } else { + return 0; + } +} + +EXTERN +int32_t __kmpc_reduce_simd(kmp_Indent *loc) { + if (threadIdx.x % 32 == 0) { + return 1; + } else { + return 0; + } +} + +EXTERN +void __kmpc_nvptx_end_reduce(int32_t global_tid) {} + +EXTERN +void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {} + +EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) { + return __SHFL_DOWN_SYNC(0xFFFFFFFF, val, delta, size); +} + +EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) { + int lo, hi; + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); + hi = __SHFL_DOWN_SYNC(0xFFFFFFFF, hi, delta, size); + lo = __SHFL_DOWN_SYNC(0xFFFFFFFF, lo, delta, size); + asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi)); + return val; +} + +static INLINE void gpu_regular_warp_reduce(void *reduce_data, + kmp_ShuffleReductFctPtr shflFct) { + for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) { + shflFct(reduce_data, /*LaneId - not used= */ 0, + /*Offset = */ mask, /*AlgoVersion=*/0); + } +} + +static INLINE void gpu_irregular_warp_reduce(void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, + uint32_t size, uint32_t tid) { + uint32_t curr_size; + uint32_t mask; + curr_size = size; + mask = curr_size / 2; + while (mask > 0) { + shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1); + curr_size = (curr_size + 1) / 2; + mask = curr_size / 2; + } +} + +static INLINE uint32_t +gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) { + uint32_t lanemask_lt; + uint32_t lanemask_gt; + uint32_t size, remote_id, physical_lane_id; + physical_lane_id = GetThreadIdInBlock() % WARPSIZE; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt)); + uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true); + uint32_t logical_lane_id = __popc(Liveness & lanemask_lt) * 2; + asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask_gt)); + do { + Liveness = __BALLOT_SYNC(0xFFFFFFFF, true); + remote_id = __ffs(Liveness & lanemask_gt); + size = __popc(Liveness); + logical_lane_id /= 2; + shflFct(reduce_data, /*LaneId =*/logical_lane_id, + /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2); + } while (logical_lane_id % 2 == 0 && size > 1); + return 
(logical_lane_id == 0); +} + +EXTERN +int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars, + size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct) { + uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true); + if (Liveness == 0xffffffff) { + gpu_regular_warp_reduce(reduce_data, shflFct); + return GetThreadIdInBlock() % WARPSIZE == + 0; // Result on lane 0 of the simd warp. + } else { + return gpu_irregular_simd_reduce( + reduce_data, shflFct); // Result on the first active lane. + } +} + +INLINE +int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars, + size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct, + bool isSPMDExecutionMode, + bool isRuntimeUninitialized = false) { + /* + * This reduce function handles reduction within a team. It handles + * parallel regions in both L1 and L2 parallelism levels. It also + * supports Generic, SPMD, and NoOMP modes. + * + * 1. Reduce within a warp. + * 2. Warp master copies value to warp 0 via shared memory. + * 3. Warp 0 reduces to a single value. + * 4. The reduced value is available in the thread that returns 1. + */ + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + uint32_t BlockThreadId = GetLogicalThreadIdInBlock(); + uint32_t NumThreads = GetNumberOfOmpThreads( + BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized); + uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE; + uint32_t WarpId = BlockThreadId / WARPSIZE; + + // Volta execution model: + // For the Generic execution mode a parallel region either has 1 thread and + // beyond that, always a multiple of 32. For the SPMD execution mode we may + // have any number of threads. + if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1)) + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (NumThreads > 1) // Only SPMD execution mode comes thru this case. + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/NumThreads % WARPSIZE, + /*LaneId=*/GetThreadIdInBlock() % WARPSIZE); + + // When we have more than [warpsize] number of threads + // a block reduction is performed here. + // + // Only L1 parallel region can enter this if condition. + if (NumThreads > WARPSIZE) { + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + BlockThreadId); + + return BlockThreadId == 0; + } + return BlockThreadId == 0; +#else + uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true); + if (Liveness == 0xffffffff) // Full warp + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/__popc(Liveness), + /*LaneId=*/GetThreadIdInBlock() % WARPSIZE); + else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2 + // parallel region may enter here; return + // early. + return gpu_irregular_simd_reduce(reduce_data, shflFct); + + uint32_t BlockThreadId = GetLogicalThreadIdInBlock(); + uint32_t NumThreads = GetNumberOfOmpThreads( + BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized); + + // When we have more than [warpsize] number of threads + // a block reduction is performed here. + // + // Only L1 parallel region can enter this if condition. 
+ if (NumThreads > WARPSIZE) { + uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE; + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + uint32_t WarpId = BlockThreadId / WARPSIZE; + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + BlockThreadId); + + return BlockThreadId == 0; + } else if (isRuntimeUninitialized /* Never an L2 parallel region without the OMP runtime */) { + return BlockThreadId == 0; + } + + // Get the OMP thread Id. This is different from BlockThreadId in the case of + // an L2 parallel region. + return GetOmpThreadId(BlockThreadId, isSPMDExecutionMode, + isRuntimeUninitialized) == 0; +#endif // __CUDA_ARCH__ >= 700 +} + +EXTERN +int32_t __kmpc_nvptx_parallel_reduce_nowait( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { + return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, + /*isSPMDExecutionMode=*/isSPMDMode()); +} + +EXTERN +int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { + return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, + /*isSPMDExecutionMode=*/true, + /*isRuntimeUninitialized=*/true); +} + +EXTERN +int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { + return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, + /*isSPMDExecutionMode=*/false, + /*isRuntimeUninitialized=*/true); +} + +INLINE +int32_t nvptx_teams_reduce_nowait( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct, + bool isSPMDExecutionMode, bool isRuntimeUninitialized = false) { + uint32_t ThreadId = GetLogicalThreadIdInBlock(); + // In non-generic mode all workers participate in the teams reduction. + // In generic mode only the team master participates in the teams + // reduction because the workers are waiting for parallel work. + uint32_t NumThreads = + isSPMDExecutionMode + ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true, + isRuntimeUninitialized) + : /*Master thread only*/ 1; + uint32_t TeamId = GetBlockIdInKernel(); + uint32_t NumTeams = GetNumberOfBlocksInKernel(); + __shared__ volatile bool IsLastTeam; + + // Team masters of all teams write to the scratchpad. + if (ThreadId == 0) { + unsigned int *timestamp = GetTeamsReductionTimestamp(); + char *scratchpad = GetTeamsReductionScratchpad(); + + scratchFct(reduce_data, scratchpad, TeamId, NumTeams); + __threadfence(); + + // atomicInc increments 'timestamp' and has a range [0, NumTeams-1]. + // It resets 'timestamp' back to 0 once the last team increments + // this counter. + unsigned val = atomicInc(timestamp, NumTeams - 1); + IsLastTeam = val == NumTeams - 1; + } + + // We have to wait on L1 barrier because in GENERIC mode the workers + // are waiting on barrier 0 for work. 
+ // + // If we guard this barrier as follows it leads to deadlock, probably + // because of a compiler bug: if (!IsGenericMode()) __syncthreads(); + uint16_t SyncWarps = (NumThreads + WARPSIZE - 1) / WARPSIZE; + named_sync(L1_BARRIER, SyncWarps * WARPSIZE); + + // If this team is not the last, quit. + if (/* Volatile read by all threads */ !IsLastTeam) + return 0; + + // + // Last team processing. + // + + // Threads in excess of #teams do not participate in reduction of the + // scratchpad values. +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + uint32_t ActiveThreads = NumThreads; + if (NumTeams < NumThreads) { + ActiveThreads = + (NumTeams < WARPSIZE) ? 1 : NumTeams & ~((uint16_t)WARPSIZE - 1); + } + if (ThreadId >= ActiveThreads) + return 0; + + // Load from scratchpad and reduce. + char *scratchpad = GetTeamsReductionScratchpad(); + ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0); + for (uint32_t i = ActiveThreads + ThreadId; i < NumTeams; i += ActiveThreads) + ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1); + + uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; + uint32_t WarpId = ThreadId / WARPSIZE; + + // Reduce across warps to the warp master. + if ((ActiveThreads % WARPSIZE == 0) || + (WarpId < WarpsNeeded - 1)) // Full warp + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (ActiveThreads > 1) // Partial warp but contiguous lanes + // Only SPMD execution mode comes thru this case. + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/ActiveThreads % WARPSIZE, + /*LaneId=*/ThreadId % WARPSIZE); + + // When we have more than [warpsize] number of threads + // a block reduction is performed here. + if (ActiveThreads > WARPSIZE) { + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId); + } +#else + if (ThreadId >= NumTeams) + return 0; + + // Load from scratchpad and reduce. + char *scratchpad = GetTeamsReductionScratchpad(); + ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0); + for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads) + ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1); + + // Reduce across warps to the warp master. + uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true); + if (Liveness == 0xffffffff) // Full warp + gpu_regular_warp_reduce(reduce_data, shflFct); + else // Partial warp but contiguous lanes + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/__popc(Liveness), + /*LaneId=*/ThreadId % WARPSIZE); + + // When we have more than [warpsize] number of threads + // a block reduction is performed here. + uint32_t ActiveThreads = NumTeams < NumThreads ? NumTeams : NumThreads; + if (ActiveThreads > WARPSIZE) { + uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; + // Gather all the reduced values from each warp + // to the first warp. 
+ cpyFct(reduce_data, WarpsNeeded); + + uint32_t WarpId = ThreadId / WARPSIZE; + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId); + } +#endif // __CUDA_ARCH__ >= 700 + + return ThreadId == 0; +} + +EXTERN +int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars, + size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr scratchFct, + kmp_LoadReduceFctPtr ldFct) { + return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, scratchFct, + ldFct, /*isSPMDExecutionMode=*/isSPMDMode()); +} + +EXTERN +int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) { + return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, scratchFct, + ldFct, + /*isSPMDExecutionMode=*/true, + /*isRuntimeUninitialized=*/true); +} + +EXTERN +int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) { + return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, scratchFct, + ldFct, + /*isSPMDExecutionMode=*/false, + /*isRuntimeUninitialized=*/true); +} Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/state-queue.h =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/state-queue.h +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/state-queue.h @@ -0,0 +1,52 @@ +//===--------- statequeue.h - NVPTX OpenMP GPU State Queue ------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a queue to hand out OpenMP state objects to teams of +// one or more kernels. +// +// Reference: +// Thomas R.W. Scogland and Wu-chun Feng. 2015. +// Design and Evaluation of Scalable Concurrent Queues for Many-Core +// Architectures. International Conference on Performance Engineering. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __STATE_QUEUE_H
+#define __STATE_QUEUE_H
+
+#include <stdint.h>
+
+#include "option.h" // choices we have
+
+template <typename ElementType, uint32_t SIZE> class omptarget_nvptx_Queue {
+private:
+  ElementType elements[SIZE];
+  volatile ElementType *elementQueue[SIZE];
+  volatile uint32_t head;
+  volatile uint32_t ids[SIZE];
+  volatile uint32_t tail;
+
+  static const uint32_t MAX_ID = (1u << 31) / SIZE / 2;
+  INLINE uint32_t ENQUEUE_TICKET();
+  INLINE uint32_t DEQUEUE_TICKET();
+  INLINE uint32_t ID(uint32_t ticket);
+  INLINE bool IsServing(uint32_t slot, uint32_t id);
+  INLINE void PushElement(uint32_t slot, ElementType *element);
+  INLINE ElementType *PopElement(uint32_t slot);
+  INLINE void DoneServing(uint32_t slot, uint32_t id);
+
+public:
+  INLINE omptarget_nvptx_Queue(){};
+  INLINE void Enqueue(ElementType *element);
+  INLINE ElementType *Dequeue();
+};
+
+#include "state-queuei.h"
+
+#endif
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/state-queuei.h
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/state-queuei.h
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/state-queuei.h
@@ -0,0 +1,89 @@
+//===------- state-queue.cu - NVPTX OpenMP GPU State Queue ------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of a queue to hand out OpenMP state
+// objects to teams of one or more kernels.
+//
+// Reference:
+// Thomas R.W. Scogland and Wu-chun Feng. 2015.
+// Design and Evaluation of Scalable Concurrent Queues for Many-Core
+// Architectures. International Conference on Performance Engineering.
+//
+//===----------------------------------------------------------------------===//
+
+#include "state-queue.h"
+
+template <typename ElementType, uint32_t SIZE>
+INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ENQUEUE_TICKET() {
+  return atomicAdd((unsigned int *)&tail, 1);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::DEQUEUE_TICKET() {
+  return atomicAdd((unsigned int *)&head, 1);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ID(uint32_t ticket) {
+  return (ticket / SIZE) * 2;
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE bool omptarget_nvptx_Queue<ElementType, SIZE>::IsServing(uint32_t slot,
+                                                                uint32_t id) {
+  return atomicAdd((unsigned int *)&ids[slot], 0) == id;
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE void
+omptarget_nvptx_Queue<ElementType, SIZE>::PushElement(uint32_t slot,
+                                                      ElementType *element) {
+  atomicExch((unsigned long long *)&elementQueue[slot],
+             (unsigned long long)element);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE ElementType *
+omptarget_nvptx_Queue<ElementType, SIZE>::PopElement(uint32_t slot) {
+  return (ElementType *)atomicAdd((unsigned long long *)&elementQueue[slot],
+                                  (unsigned long long)0);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE void omptarget_nvptx_Queue<ElementType, SIZE>::DoneServing(uint32_t slot,
+                                                                  uint32_t id) {
+  atomicExch((unsigned int *)&ids[slot], (id + 1) % MAX_ID);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE void
+omptarget_nvptx_Queue<ElementType, SIZE>::Enqueue(ElementType *element) {
+  uint32_t ticket = ENQUEUE_TICKET();
+  uint32_t slot = ticket % SIZE;
+  uint32_t id = ID(ticket) + 1;
+  while (!IsServing(slot, id))
+    ;
+  PushElement(slot, element);
+  DoneServing(slot, id);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE ElementType *omptarget_nvptx_Queue<ElementType, SIZE>::Dequeue() {
+  uint32_t ticket = DEQUEUE_TICKET();
+  uint32_t slot = ticket % SIZE;
+  uint32_t id = ID(ticket);
+  while (!IsServing(slot, id))
+    ;
+  ElementType *element = PopElement(slot);
+  // This is to populate the queue because of the lack of GPU constructors.
+  if (element == 0)
+    element = &elements[slot];
+  DoneServing(slot, id);
+  return element;
+}
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/support.h
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/support.h
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/support.h
@@ -0,0 +1,92 @@
+//===--------- support.h - NVPTX OpenMP support functions -------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Wrapper to some functions natively supported by the GPU.
+// +//===----------------------------------------------------------------------===// + +//////////////////////////////////////////////////////////////////////////////// +// Execution Parameters +//////////////////////////////////////////////////////////////////////////////// +enum ExecutionMode { + Generic = 0x00u, + Spmd = 0x01u, + ModeMask = 0x01u, +}; + +enum RuntimeMode { + RuntimeInitialized = 0x00u, + RuntimeUninitialized = 0x02u, + RuntimeMask = 0x02u, +}; + +INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode); +INLINE bool isGenericMode(); +INLINE bool isSPMDMode(); +INLINE bool isRuntimeUninitialized(); +INLINE bool isRuntimeInitialized(); + +//////////////////////////////////////////////////////////////////////////////// +// get info from machine +//////////////////////////////////////////////////////////////////////////////// + +// get low level ids of resources +INLINE int GetThreadIdInBlock(); +INLINE int GetBlockIdInKernel(); +INLINE int GetNumberOfBlocksInKernel(); +INLINE int GetNumberOfThreadsInBlock(); + +// get global ids to locate tread/team info (constant regardless of OMP) +INLINE int GetLogicalThreadIdInBlock(); +INLINE int GetMasterThreadID(); +INLINE int GetNumberOfWorkersInTeam(); + +// get OpenMP thread and team ids +INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode, + bool isRuntimeUninitialized); // omp_thread_num +INLINE int GetOmpTeamId(); // omp_team_num + +// get OpenMP number of threads and team +INLINE int +GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode, + bool isRuntimeUninitialized); // omp_num_threads +INLINE int GetNumberOfOmpTeams(); // omp_num_teams + +// get OpenMP number of procs +INLINE int GetNumberOfProcsInTeam(); +INLINE int GetNumberOfProcsInDevice(); + +// masters +INLINE int IsTeamMaster(int ompThreadId); + +//////////////////////////////////////////////////////////////////////////////// +// Memory +//////////////////////////////////////////////////////////////////////////////// + +// safe alloc and free +INLINE void *SafeMalloc(size_t size, const char *msg); // check if success +INLINE void *SafeFree(void *ptr, const char *msg); +// pad to a alignment (power of 2 only) +INLINE unsigned long PadBytes(unsigned long size, unsigned long alignment); +#define ADD_BYTES(_addr, _bytes) \ + ((void *)((char *)((void *)(_addr)) + (_bytes))) +#define SUB_BYTES(_addr, _bytes) \ + ((void *)((char *)((void *)(_addr)) - (_bytes))) + +//////////////////////////////////////////////////////////////////////////////// +// Named Barrier Routines +//////////////////////////////////////////////////////////////////////////////// +INLINE void named_sync(const int barrier, const int num_threads); + +//////////////////////////////////////////////////////////////////////////////// +// Teams Reduction Scratchpad Helpers +//////////////////////////////////////////////////////////////////////////////// +INLINE unsigned int *GetTeamsReductionTimestamp(); +INLINE char *GetTeamsReductionScratchpad(); +INLINE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr); Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -0,0 +1,216 @@ +//===--------- supporti.h - NVPTX OpenMP support functions ------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University 
of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Wrapper implementation to some functions natively supported by the GPU. +// +//===----------------------------------------------------------------------===// + +//////////////////////////////////////////////////////////////////////////////// +// Execution Parameters +//////////////////////////////////////////////////////////////////////////////// + +INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) { + execution_param = EMode; + execution_param |= RMode; +} + +INLINE bool isGenericMode() { return (execution_param & ModeMask) == Generic; } + +INLINE bool isSPMDMode() { return (execution_param & ModeMask) == Spmd; } + +INLINE bool isRuntimeUninitialized() { + return (execution_param & RuntimeMask) == RuntimeUninitialized; +} + +INLINE bool isRuntimeInitialized() { + return (execution_param & RuntimeMask) == RuntimeInitialized; +} + +//////////////////////////////////////////////////////////////////////////////// +// support: get info from machine +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// +// Calls to the NVPTX layer (assuming 1D layout) +// +//////////////////////////////////////////////////////////////////////////////// + +INLINE int GetThreadIdInBlock() { return threadIdx.x; } + +INLINE int GetBlockIdInKernel() { return blockIdx.x; } + +INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; } + +INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; } + +//////////////////////////////////////////////////////////////////////////////// +// +// Calls to the Generic Scheme Implementation Layer (assuming 1D layout) +// +//////////////////////////////////////////////////////////////////////////////// + +// The master thread id is the first thread (lane) of the last warp. +// Thread id is 0 indexed. +// E.g: If NumThreads is 33, master id is 32. +// If NumThreads is 64, master id is 32. +// If NumThreads is 97, master id is 96. +// If NumThreads is 1024, master id is 992. +// +// Called in Generic Execution Mode only. +INLINE int GetMasterThreadID() { return (blockDim.x - 1) & ~(WARPSIZE - 1); } + +// The last warp is reserved for the master; other warps are workers. +// Called in Generic Execution Mode only. +INLINE int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); } + +//////////////////////////////////////////////////////////////////////////////// +// get thread id in team + +// This function may be called in a parallel region by the workers +// or a serial region by the master. If the master (whose CUDA thread +// id is GetMasterThreadID()) calls this routine, we return 0 because +// it is a shadow for the first worker. +INLINE int GetLogicalThreadIdInBlock() { + // return GetThreadIdInBlock() % GetMasterThreadID(); + + // Implemented using control flow (predication) instead of with a modulo + // operation. 
+ int tid = GetThreadIdInBlock(); + if (isGenericMode() && tid >= GetMasterThreadID()) + return 0; + else + return tid; +} + +//////////////////////////////////////////////////////////////////////////////// +// +// OpenMP Thread Support Layer +// +//////////////////////////////////////////////////////////////////////////////// + +INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode, + bool isRuntimeUninitialized) { + // omp_thread_num + int rc; + + if (isRuntimeUninitialized) { + rc = GetThreadIdInBlock(); + if (!isSPMDExecutionMode && rc >= GetMasterThreadID()) + rc = 0; + } else { + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + rc = currTaskDescr->ThreadId(); + } + return rc; +} + +INLINE int GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode, + bool isRuntimeUninitialized) { + // omp_num_threads + int rc; + + if (isRuntimeUninitialized) { + rc = isSPMDExecutionMode ? GetNumberOfThreadsInBlock() + : GetNumberOfThreadsInBlock() - WARPSIZE; + } else { + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); + rc = currTaskDescr->ThreadsInTeam(); + } + + return rc; +} + +//////////////////////////////////////////////////////////////////////////////// +// Team id linked to OpenMP + +INLINE int GetOmpTeamId() { + // omp_team_num + return GetBlockIdInKernel(); // assume 1 block per team +} + +INLINE int GetNumberOfOmpTeams() { + // omp_num_teams + return GetNumberOfBlocksInKernel(); // assume 1 block per team +} + +//////////////////////////////////////////////////////////////////////////////// +// Masters + +INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); } + +//////////////////////////////////////////////////////////////////////////////// +// get OpenMP number of procs + +// Get the number of processors in the device. 
+INLINE int GetNumberOfProcsInDevice() {
+  if (isGenericMode())
+    return GetNumberOfWorkersInTeam();
+  else
+    return GetNumberOfThreadsInBlock();
+}
+
+INLINE int GetNumberOfProcsInTeam() { return GetNumberOfProcsInDevice(); }
+
+////////////////////////////////////////////////////////////////////////////////
+// Memory
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE unsigned long PadBytes(unsigned long size,
+                              unsigned long alignment) // must be a power of 2
+{
+  // compute the necessary padding to satisfy alignment constraint
+  ASSERT(LT_FUSSY, (alignment & (alignment - 1)) == 0,
+         "alignment %ld is not a power of 2\n", alignment);
+  return (~(unsigned long)size + 1) & (alignment - 1);
+}
+
+INLINE void *SafeMalloc(size_t size, const char *msg) // check if success
+{
+  void *ptr = malloc(size);
+  PRINT(LD_MEM, "malloc data of size %d for %s: 0x%llx\n", size, msg, P64(ptr));
+  ASSERT(LT_SAFETY, ptr, "failed to allocate %d bytes for %s\n", size, msg);
+  return ptr;
+}
+
+INLINE void *SafeFree(void *ptr, const char *msg) {
+  PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", P64(ptr), msg);
+  free(ptr);
+  return NULL;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Named Barrier Routines
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE void named_sync(const int barrier, const int num_threads) {
+  asm volatile("bar.sync %0, %1;"
+               :
+               : "r"(barrier), "r"(num_threads)
+               : "memory");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Teams Reduction Scratchpad Helpers
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE unsigned int *GetTeamsReductionTimestamp() {
+  return static_cast<unsigned int *>(ReductionScratchpadPtr);
+}
+
+INLINE char *GetTeamsReductionScratchpad() {
+  return static_cast<char *>(ReductionScratchpadPtr) + 256;
+}
+
+INLINE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr) {
+  ReductionScratchpadPtr = ScratchpadPtr;
+}
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu
@@ -0,0 +1,153 @@
+//===------------ sync.h - NVPTX OpenMP synchronizations --------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Include all synchronization.
+// +//===----------------------------------------------------------------------===// + +#include "omptarget-nvptx.h" + +//////////////////////////////////////////////////////////////////////////////// +// KMP Ordered calls +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_ordered(kmp_Indent *loc, int32_t tid) { + PRINT0(LD_IO, "call kmpc_ordered\n"); +} + +EXTERN void __kmpc_end_ordered(kmp_Indent *loc, int32_t tid) { + PRINT0(LD_IO, "call kmpc_end_ordered\n"); +} + +//////////////////////////////////////////////////////////////////////////////// +// KMP Barriers +//////////////////////////////////////////////////////////////////////////////// + +// a team is a block: we can use CUDA native synchronization mechanism +// FIXME: what if not all threads (warps) participate to the barrier? +// We may need to implement it differently + +EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc_ref, int32_t tid) { + PRINT0(LD_IO, "call kmpc_cancel_barrier\n"); + __syncthreads(); + PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n"); + return 0; +} + +EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) { + tid = GetLogicalThreadIdInBlock(); + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid); + if (!currTaskDescr->InL2OrHigherParallelRegion()) { + int numberOfActiveOMPThreads = + GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + // On Volta and newer architectures we require that all lanes in + // a warp (at least, all present for the kernel launch) participate in the + // barrier. This is enforced when launching the parallel region. An + // exception is when there are < WARPSIZE workers. In this case only 1 + // worker is started, so we don't need a barrier. + if (numberOfActiveOMPThreads > 1) { +#endif + // The #threads parameter must be rounded up to the WARPSIZE. + int threads = + WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE); + + PRINT(LD_SYNC, + "call kmpc_barrier with %d omp threads, sync parameter %d\n", + numberOfActiveOMPThreads, threads); + // Barrier #1 is for synchronization among active threads. + named_sync(L1_BARRIER, threads); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + } // numberOfActiveOMPThreads > 1 +#endif + } + PRINT0(LD_SYNC, "completed kmpc_barrier\n"); +} + +// Emit a simple barrier call in SPMD mode. Assumes the caller is in an L0 +// parallel region and that all worker threads participate. +EXTERN void __kmpc_barrier_simple_spmd(kmp_Indent *loc_ref, int32_t tid) { + PRINT0(LD_SYNC, "call kmpc_barrier_simple_spmd\n"); + __syncthreads(); + PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n"); +} + +// Emit a simple barrier call in Generic mode. Assumes the caller is in an L0 +// parallel region and that all worker threads participate. +EXTERN void __kmpc_barrier_simple_generic(kmp_Indent *loc_ref, int32_t tid) { + int numberOfActiveOMPThreads = GetNumberOfThreadsInBlock() - WARPSIZE; + // The #threads parameter must be rounded up to the WARPSIZE. + int threads = + WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE); + + PRINT(LD_SYNC, + "call kmpc_barrier_simple_generic with %d omp threads, sync parameter " + "%d\n", + numberOfActiveOMPThreads, threads); + // Barrier #1 is for synchronization among active threads. 
+ named_sync(L1_BARRIER, threads); + PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n"); +} + +//////////////////////////////////////////////////////////////////////////////// +// KMP MASTER +//////////////////////////////////////////////////////////////////////////////// + +INLINE int32_t IsMaster() { + // only the team master updates the state + int tid = GetLogicalThreadIdInBlock(); + int ompThreadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()); + return IsTeamMaster(ompThreadId); +} + +EXTERN int32_t __kmpc_master(kmp_Indent *loc, int32_t global_tid) { + PRINT0(LD_IO, "call kmpc_master\n"); + return IsMaster(); +} + +EXTERN void __kmpc_end_master(kmp_Indent *loc, int32_t global_tid) { + PRINT0(LD_IO, "call kmpc_end_master\n"); + ASSERT0(LT_FUSSY, IsMaster(), "expected only master here"); +} + +//////////////////////////////////////////////////////////////////////////////// +// KMP SINGLE +//////////////////////////////////////////////////////////////////////////////// + +EXTERN int32_t __kmpc_single(kmp_Indent *loc, int32_t global_tid) { + PRINT0(LD_IO, "call kmpc_single\n"); + // decide to implement single with master; master get the single + return IsMaster(); +} + +EXTERN void __kmpc_end_single(kmp_Indent *loc, int32_t global_tid) { + PRINT0(LD_IO, "call kmpc_end_single\n"); + // decide to implement single with master: master get the single + ASSERT0(LT_FUSSY, IsMaster(), "expected only master here"); + // sync barrier is explicitely called... so that is not a problem +} + +//////////////////////////////////////////////////////////////////////////////// +// Flush +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_flush(kmp_Indent *loc) { + PRINT0(LD_IO, "call kmpc_flush\n"); + __threadfence_block(); +} + +//////////////////////////////////////////////////////////////////////////////// +// Vote +//////////////////////////////////////////////////////////////////////////////// + +EXTERN int32_t __kmpc_warp_active_thread_mask() { + PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n"); + return __ACTIVEMASK(); +} Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/task.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/task.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/task.cu @@ -0,0 +1,208 @@ +//===------------- task.h - NVPTX OpenMP tasks support ----------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Task implementation support. +// +// explicit task structure uses +// omptarget_nvptx task +// kmp_task +// +// where kmp_task is +// - klegacy_TaskDescr <- task pointer +// shared -> X +// routine +// part_id +// descr +// - private (of size given by task_alloc call). Accessed by +// task+sizeof(klegacy_TaskDescr) +// * private data * +// - shared: X. 
Accessed by shared ptr in klegacy_TaskDescr +// * pointer table to shared variables * +// - end +// +//===----------------------------------------------------------------------===// + +#include "omptarget-nvptx.h" + +EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc( + kmp_Indent *loc, // unused + uint32_t global_tid, // unused + int32_t flag, // unused (because in our impl, all are immediately exec + size_t sizeOfTaskInclPrivate, size_t sizeOfSharedTable, + kmp_TaskFctPtr taskSub) { + PRINT(LD_IO, + "call __kmpc_omp_task_alloc(size priv&struct %lld, shared %lld, " + "fct 0x%llx)\n", + P64(sizeOfTaskInclPrivate), P64(sizeOfSharedTable), P64(taskSub)); + // want task+priv to be a multiple of 8 bytes + size_t padForTaskInclPriv = PadBytes(sizeOfTaskInclPrivate, sizeof(void *)); + sizeOfTaskInclPrivate += padForTaskInclPriv; + size_t kmpSize = sizeOfTaskInclPrivate + sizeOfSharedTable; + ASSERT(LT_FUSSY, sizeof(omptarget_nvptx_TaskDescr) % sizeof(void *) == 0, + "need task descr of size %d to be a multiple of %d\n", + sizeof(omptarget_nvptx_TaskDescr), sizeof(void *)); + size_t totSize = sizeof(omptarget_nvptx_TaskDescr) + kmpSize; + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = + (omptarget_nvptx_ExplicitTaskDescr *)SafeMalloc( + totSize, "explicit task descriptor"); + kmp_TaskDescr *newKmpTaskDescr = &newExplicitTaskDescr->kmpTaskDescr; + ASSERT0(LT_FUSSY, + (uint64_t)newKmpTaskDescr == + (uint64_t)ADD_BYTES(newExplicitTaskDescr, + sizeof(omptarget_nvptx_TaskDescr)), + "bad size assumptions"); + // init kmp_TaskDescr + newKmpTaskDescr->sharedPointerTable = + (void *)((char *)newKmpTaskDescr + sizeOfTaskInclPrivate); + newKmpTaskDescr->sub = taskSub; + newKmpTaskDescr->destructors = NULL; + PRINT(LD_TASK, "return with task descr kmp: 0x%llx, omptarget-nvptx 0x%llx\n", + P64(newKmpTaskDescr), P64(newExplicitTaskDescr)); + + return newKmpTaskDescr; +} + +EXTERN int32_t __kmpc_omp_task(kmp_Indent *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr) { + return __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, + 0); +} + +EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Indent *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr, + int32_t depNum, void *depList, + int32_t noAliasDepNum, + void *noAliasDepList) { + PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n", + P64(newKmpTaskDescr)); + // 1. get explict task descr from kmp task descr + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = + (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( + newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); + ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, + "bad assumptions"); + omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr; + ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr, + "bad assumptions"); + + // 2. push new context: update new task descriptor + int tid = GetLogicalThreadIdInBlock(); + omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid); + newTaskDescr->CopyForExplicitTask(parentTaskDescr); + // set new task descriptor as top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr); + + // 3. call sub + PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n", + P64(newKmpTaskDescr->sub), P64(newKmpTaskDescr)); + newKmpTaskDescr->sub(0, newKmpTaskDescr); + PRINT(LD_TASK, "return from call task sub 0x%llx()\n", + P64(newKmpTaskDescr->sub)); + + // 4. 
pop context + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, + parentTaskDescr); + // 5. free + SafeFree(newExplicitTaskDescr, "explicit task descriptor"); + return 0; +} + +EXTERN void __kmpc_omp_task_begin_if0(kmp_Indent *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr) { + PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n", + P64(newKmpTaskDescr)); + // 1. get explict task descr from kmp task descr + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = + (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( + newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); + ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, + "bad assumptions"); + omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr; + ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr, + "bad assumptions"); + + // 2. push new context: update new task descriptor + int tid = GetLogicalThreadIdInBlock(); + omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid); + newTaskDescr->CopyForExplicitTask(parentTaskDescr); + // set new task descriptor as top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr); + // 3... noting to call... is inline + // 4 & 5 ... done in complete +} + +EXTERN void __kmpc_omp_task_complete_if0(kmp_Indent *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr) { + PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n", + P64(newKmpTaskDescr)); + // 1. get explict task descr from kmp task descr + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = + (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( + newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); + ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, + "bad assumptions"); + omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr; + ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr, + "bad assumptions"); + // 2. get parent + omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr(); + // 3... noting to call... is inline + // 4. pop context + int tid = GetLogicalThreadIdInBlock(); + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, + parentTaskDescr); + // 5. 
free + SafeFree(newExplicitTaskDescr, "explicit task descriptor"); +} + +EXTERN void __kmpc_omp_wait_deps(kmp_Indent *loc, uint32_t global_tid, + int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList) { + PRINT0(LD_IO, "call to __kmpc_omp_wait_deps(..)\n"); + // nothing to do as all our tasks are executed as final +} + +EXTERN void __kmpc_taskgroup(kmp_Indent *loc, uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_taskgroup(..)\n"); + // nothing to do as all our tasks are executed as final +} + +EXTERN void __kmpc_end_taskgroup(kmp_Indent *loc, uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_end_taskgroup(..)\n"); + // nothing to do as all our tasks are executed as final +} + +EXTERN int32_t __kmpc_omp_taskyield(kmp_Indent *loc, uint32_t global_tid, + int end_part) { + PRINT0(LD_IO, "call to __kmpc_taskyield()\n"); + // do nothing: tasks are executed immediately, no yielding allowed + return 0; +} + +EXTERN int32_t __kmpc_omp_taskwait(kmp_Indent *loc, uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_taskwait()\n"); + // nothing to do as all our tasks are executed as final + return 0; +} + +EXTERN void __kmpc_taskloop(kmp_Indent *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr, int if_val, + uint64_t *lb, uint64_t *ub, int64_t st, int nogroup, + int32_t sched, uint64_t grainsize, void *task_dup) { + + // skip task entirely if empty iteration space + if (*lb > *ub) + return; + + // the compiler has already stored lb and ub in the kmp_TaskDescr structure + // as we are using a single task to execute the entire loop, we can leave + // the initial task_t untouched + + __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, 0); +}
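
The offset-halving shuffle reduction used throughout reduction.cu (gpu_regular_warp_reduce with AlgoVersion 0 and the __kmpc_shuffle_* helpers) is easier to follow outside the runtime plumbing. The standalone CUDA sketch below is illustrative only and is not part of this patch; the kernel name warp_sum_sketch and the host driver are invented for the example, and it assumes a single full warp of 32 lanes.

// Minimal sketch of the shuffle tree reduction pattern used in reduction.cu.
// Names are illustrative; this is not runtime code from the patch.
#include <cstdio>

__global__ void warp_sum_sketch(const int *in, int *out) {
  int lane = threadIdx.x & 31; // lane id within the warp
  int val = in[threadIdx.x];   // each lane's private partial value

  // Offsets 16, 8, 4, 2, 1: each step folds the upper half of the live lanes
  // onto the lower half, so after log2(32) = 5 steps lane 0 holds the sum.
  for (int offset = 16; offset > 0; offset >>= 1)
    val += __shfl_down_sync(0xffffffffu, val, offset);

  if (lane == 0) // only lane 0 holds the defined result
    *out = val;
}

int main() {
  int h_in[32], h_out = 0;
  int *d_in, *d_out;
  for (int i = 0; i < 32; ++i)
    h_in[i] = i + 1; // expected sum: 32 * 33 / 2 = 528
  cudaMalloc((void **)&d_in, sizeof(h_in));
  cudaMalloc((void **)&d_out, sizeof(int));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  warp_sum_sketch<<<1, 32>>>(d_in, d_out);
  cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
  printf("warp sum = %d\n", h_out); // prints 528
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}

Because every lane of a full warp participates, no shared memory or __syncthreads() is needed; that is the same property the runtime relies on before falling back to gpu_irregular_warp_reduce for partial warps or to the inter-warp copy function when a team spans multiple warps.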