Index: External/CMakeLists.txt =================================================================== --- External/CMakeLists.txt +++ External/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(CUDA) +add_subdirectory(HIP) add_subdirectory(HMMER) add_subdirectory(Nurbs) add_subdirectory(Povray) Index: External/CUDA/CMakeLists.txt =================================================================== --- External/CUDA/CMakeLists.txt +++ External/CUDA/CMakeLists.txt @@ -1,4 +1,5 @@ include(External) +include(Variant) llvm_externals_find(TEST_SUITE_CUDA_ROOT "cuda" "CUDA prerequisites") set(SUPPORTED_GPU_CUDA_7_0 @@ -30,83 +31,36 @@ set(SUPPORTED_GPU_CUDA_11_0 ${SUPPORTED_GPU_CUDA_10_2} sm_80) -# Helper macro to extract version number at the end of the string -# Input: get_version(Var String) -# Where String = /some/string/with/version-x.y.z -# Output: -# Sets Var=x.y.z -macro(get_version Var Path) - string(REGEX MATCH "[0-9]+(\\.[0-9]+)*$" ${Var} ${Path}) -endmacro (get_version) - -# Helper function to glob CUDA source files and set LANGUAGE property -# to CXX on each of them. Sets Var in parent scope to the list of -# files found. -macro(cuda_glob Var) - file(GLOB FileList ${ARGN}) - foreach(File IN LISTS FileList) - if(${File} MATCHES ".*\.cu$") - set_source_files_properties(${File} PROPERTIES LANGUAGE CXX) - endif() - endforeach() - set(${Var} ${FileList}) -endmacro(cuda_glob) - -macro(create_one_local_test_f Name FileGlob FilterRegex) - if (${VariantSuffix} MATCHES ${FilterRegex}) - cuda_glob(_sources ${FileGlob}) - set(_executable ${Name}-${VariantSuffix}) - set(_executable_path ${CMAKE_CURRENT_BINARY_DIR}/${_executable}) - llvm_test_run() - set(REFERENCE_OUTPUT) - # Verify reference output if it exists. - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${Name}.reference_output) - set(REFERENCE_OUTPUT ${Name}.reference_output) - llvm_test_verify(WORKDIR ${CMAKE_CURRENT_BINARY_DIR} - ${FPCMP} %o ${REFERENCE_OUTPUT}-${VariantSuffix} - ) - llvm_test_executable(${_executable} ${_sources}) - llvm_test_data(${_executable} - DEST_SUFFIX "-${VariantSuffix}" - ${REFERENCE_OUTPUT}) - else() - llvm_test_executable(${_executable} ${_sources}) - endif() - target_compile_options(${_executable} PUBLIC ${VariantCPPFLAGS}) - if(VariantLibs) - target_link_libraries(${_executable} ${VariantLibs}) - endif() - add_dependencies(cuda-tests-simple-${VariantSuffix} ${_executable}) - # Local tests are presumed to be fast. - list(APPEND CUDA_SIMPLE_TEST_TARGETS ${_executable}.test) - endif() -endmacro() - -macro(create_one_local_test Name FileGlob) - create_one_local_test_f(${Name} ${FileGlob} ".*") -endmacro() - # Create targets for CUDA tests that are part of the test suite. macro(create_local_cuda_tests VariantSuffix) - create_one_local_test(assert assert.cu) - create_one_local_test(axpy axpy.cu) - create_one_local_test(algorithm algorithm.cu) - create_one_local_test(cmath cmath.cu) - create_one_local_test(complex complex.cu) - create_one_local_test(math_h math_h.cu) - create_one_local_test(new new.cu) - create_one_local_test(empty empty.cu) - create_one_local_test(printf printf.cu) - create_one_local_test(future future.cu) - create_one_local_test(builtin_var builtin_var.cu) - # We only need SIMD tests on CUDA-8.0 to verivy that our reference is correct + set(VariantOffload "cuda") + list(APPEND CUDA_LOCAL_TESTS assert) + list(APPEND CUDA_LOCAL_TESTS axpy) + list(APPEND CUDA_LOCAL_TESTS algorithm) + list(APPEND CUDA_LOCAL_TESTS cmath) + list(APPEND CUDA_LOCAL_TESTS complex) + list(APPEND CUDA_LOCAL_TESTS math_h) + list(APPEND CUDA_LOCAL_TESTS new) + list(APPEND CUDA_LOCAL_TESTS empty) + list(APPEND CUDA_LOCAL_TESTS printf) + list(APPEND CUDA_LOCAL_TESTS future) + list(APPEND CUDA_LOCAL_TESTS builtin_var) + list(APPEND CUDA_LOCAL_TESTS test_round) + foreach(_cuda_test IN LISTS CUDA_LOCAL_TESTS) + create_one_local_test(${_cuda_test} ${_cuda_test}.cu + ${VariantOffload} ${VariantSuffix} + "${VariantCPPFLAGS}" "${VariantLibs}") + endforeach() + + # We only need SIMD tests on CUDA-8.0 to verify that our reference is correct # and matches NVIDIA-provided one. and on CUDA-9.2 to verify that clang's # implementation matches the reference. This test also happens to be the # longest one, so by not running unnecessary instances we speed up cuda # buildbot a lot. create_one_local_test_f(simd simd.cu - "cuda-(8[.]0|9[.]2)-c[+][+]11-libc[+][+]") - create_one_local_test(test_round test_round.cu) + "cuda-(8[.]0|9[.]2)-c[+][+]11-libc[+][+]" + ${VariantOffload} ${VariantSuffix} + "${VariantCPPFLAGS}" "${VariantLibs}") endmacro() macro(thrust_make_test_name TestName TestSourcePath) @@ -196,7 +150,7 @@ # Target for CUDA tests that take little time to build and run. add_custom_target(check-cuda-simple-${VariantSuffix} COMMAND ${TEST_SUITE_LIT} ${TEST_SUITE_LIT_FLAGS} - ${CUDA_SIMPLE_TEST_TARGETS} + ${VARIANT_SIMPLE_TEST_TARGETS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} DEPENDS cuda-tests-simple-${VariantSuffix} USES_TERMINAL) Index: External/HIP/CMakeLists.txt =================================================================== --- /dev/null +++ External/HIP/CMakeLists.txt @@ -0,0 +1,98 @@ +include(External) +include(Variant) +llvm_externals_find(TEST_SUITE_HIP_ROOT "hip" "HIP prerequisites") + +# Create targets for HIP tests that are part of the test suite. +macro(create_local_hip_tests VariantSuffix) + set(VariantOffload "hip") + # Add HIP tests to be added to hip-tests-simple + list(APPEND HIP_LOCAL_TESTS empty) + list(APPEND HIP_LOCAL_TESTS saxpy) + foreach(_hip_test IN LISTS HIP_LOCAL_TESTS) + create_one_local_test(${_hip_test} ${_hip_test}.hip + ${VariantOffload} ${VariantSuffix} + "${VariantCPPFLAGS}" "${VariantLibs}") + endforeach() +endmacro() + +function(create_hip_test VariantSuffix) + message(STATUS "Creating HIP test variant ${VariantSuffix}") + add_custom_target(hip-tests-simple-${VariantSuffix} + COMMENT "Build HIP test variant ${VariantSuffix}") + + set(VariantCPPFLAGS ${_HIP_CPPFLAGS}) + set(VariantLibs ${_HIP_Libs}) + list(APPEND LDFLAGS ${_HIP_LDFLAGS}) + + create_local_hip_tests(${VariantSuffix}) + add_dependencies(hip-tests-simple hip-tests-simple-${VariantSuffix}) + + add_custom_target(check-hip-simple-${VariantSuffix} + COMMAND ${TEST_SUITE_LIT} ${TEST_SUITE_LIT_FLAGS} + ${VARIANT_SIMPLE_TEST_TARGETS} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS hip-tests-simple-${VariantSuffix} + USES_TERMINAL) + add_dependencies(check-hip-simple check-hip-simple-${VariantSuffix}) +endfunction(create_hip_test) + +macro(create_hip_tests) + # Find all rocm installations at Externals/hip/ directory. + # For ROCm, the path looks like rocm-4.1.0 + message(STATUS "Checking HIP prerequisites in ${TEST_SUITE_HIP_ROOT}") + file(GLOB RocmVersions ${TEST_SUITE_HIP_ROOT}/rocm-*) + list(SORT RocmVersions) + foreach(RocmDir IN LISTS RocmVersions) + get_version(RocmVersion ${RocmDir}) + message(STATUS "Found ROCm ${RocmVersion}") + list(APPEND ROCM_PATHS ${RocmDir}) + add_library(amdhip64-${RocmVersion} SHARED IMPORTED) + set_property(TARGET amdhip64-${RocmVersion} PROPERTY IMPORTED_LOCATION + ${RocmDir}/lib/libamdhip64.so) + endforeach(RocmDir) + + if(NOT ROCM_PATHS) + message(SEND_ERROR + "There are no ROCm installations in ${TEST_SUITE_HIP_ROOT}") + return() + endif() + + add_custom_target(hip-tests-simple + COMMENT "Build all simple HIP tests") + add_custom_target(check-hip-simple + COMMENT "Run all simple HIP tests") + + if(NOT AMDGPU_ARCHS) + list(APPEND AMDGPU_ARCHS "gfx906;gfx908") + endif() + + foreach(_RocmPath ${ROCM_PATHS}) + get_version(_RocmVersion ${_RocmPath}) + set(_HIP_Suffix "hip-${_RocmVersion}") + # Set up HIP test flags + set(_HIP_CPPFLAGS -xhip --hip-device-lib-path=${_RocmPath}/amdgcn/bitcode + -I${_RocmPath}/include) + set(_HIP_LDFLAGS -L${_RocmPath}/lib -lamdhip64) + set(_HIP_Libs amdhip64-${RocmVersion}) + + # Unset these for each iteration of rocm path. + set(_ArchFlags) + set(_ArchList) + foreach(_AMDGPUArch IN LISTS AMDGPU_ARCHS) + list(APPEND _ArchFlags --offload-arch=${_AMDGPUArch}) + endforeach() + message(STATUS "Building ${_RocmPath} targets for ${AMDGPU_ARCHS}") + list(APPEND _HIP_CPPFLAGS ${_ArchFlags}) + + create_hip_test(${_HIP_Suffix}) + endforeach() + + add_custom_target(hip-tests-all DEPENDS hip-tests-simple + COMMENT "Build all HIP tests.") + + file(COPY lit.local.cfg DESTINATION "${CMAKE_CURRENT_BINARY_DIR}") +endmacro(create_hip_tests) + +if(TEST_SUITE_HIP_ROOT) + create_hip_tests() +endif() Index: External/HIP/README =================================================================== --- /dev/null +++ External/HIP/README @@ -0,0 +1,30 @@ +HIP Tests +========== + +HIP tests are enabled if cmake is invoked with +-DTEST_SUITE_EXTERNALS_DIR= and specified externals +directory contains at least one ROCm installation. + +Expected externals directory structure: +Externals/ + hip/ + rocm-X.Y.Z/ -- One or more ROCm installation. + +export EXTERNAL_DIR=/your/Externals/path +export AMDGPU_ARCHS=gfx906;gfx908 # List of AMDGPU archs to compile +export CLANG_DIR=/your/clang/build/dir +export TEST_SUITE_DIR=/path/to/test-suite-sources + +Configure, build and run tests: + +``` +$ mkdir test-suite-build-dir +$ cd test-suite-build-dir +$ PATH=$CLANG_DIR/bin:$PATH CXX=clang++ CC=clang cmake -G Ninja -DTEST_SUITE_EXTERNALS_DIR=$EXTERNAL_DIR -DAMDGPU_ARCHS=$AMDGPU_ARCHS -DCMAKE_CXX_COMPILER="$CLANG_DIR/bin/clang++" -DCMAKE_C_COMPILER="$CLANG_DIR/bin/clang" $TEST_SUITE_DIR +$ ninja hip-tests-simple +$ ninja check-hip-simple +``` + +This will build every test for each of the installed ROCm in the +$EXTERNAL_DIR/hip location, and run them against the expected +reference_output. Index: External/HIP/empty.hip =================================================================== --- /dev/null +++ External/HIP/empty.hip @@ -0,0 +1 @@ +int main(int argc, char **argv) { return 0; } Index: External/HIP/empty.reference_output =================================================================== --- /dev/null +++ External/HIP/empty.reference_output @@ -0,0 +1 @@ +exit 0 Index: External/HIP/lit.local.cfg =================================================================== --- /dev/null +++ External/HIP/lit.local.cfg @@ -0,0 +1,14 @@ +# -*- python -*- + +import os + +hip_env_vars = [ + 'HIP_VISIBLE_DEVICES', + 'LD_LIBRARY_PATH', +] + +for var in hip_env_vars: + if var in os.environ: + config.environment[var] = os.environ[var] + +config.traditional_output = True Index: External/HIP/saxpy.hip =================================================================== --- /dev/null +++ External/HIP/saxpy.hip @@ -0,0 +1,62 @@ +#include + +#include + +#define N (1024 * 500) + +__global__ void saxpy(float a, float* x, float* y) { + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < N) y[tid] = a * x[tid] + y[tid]; +} + +int main() { + + const float a = 100.0f; + float* x = (float*)malloc(N * sizeof(float)); + float* y = (float*)malloc(N * sizeof(float)); + + // Initialize the input data. + for (size_t i = 0; i < N; ++i) { + x[i] = static_cast(i); + y[i] = static_cast(i * 2); + } + + // Make a copy for the GPU implementation. + float* d_x; + float* d_y; + hipMalloc((void**)&d_x, N * sizeof(float)); + hipMalloc((void**)&d_y, N * sizeof(float)); + hipMemcpy(d_x, x, N * sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(d_y, y, N * sizeof(float), hipMemcpyHostToDevice); + + // CPU implementation of saxpy. + for (int i = 0; i < N; i++) { + y[i] = a * x[i] + y[i]; + } + + // Launch a GPU kernel to compute the saxpy. + saxpy<<<(N+255)/256, 256>>>(a, d_x, d_y); + + // Copy the device results to host. + float* h_y = (float*)malloc(N * sizeof(float)); + hipDeviceSynchronize(); + hipMemcpy(h_y, d_y, N * sizeof(float), hipMemcpyDeviceToHost); + + // Verify the results match CPU. + int errors = 0; + for (int i = 0; i < N; i++) { + if (fabs(y[i] - h_y[i]) > fabs(y[i] * 0.0001f)) + errors++; + } + if (errors != 0) + std::cout << errors << " errors" << std::endl; + else + std::cout << "PASSED!" << std::endl; + + free(h_y); + free(x); + free(y); + hipFree(d_x); + hipFree(d_y); + return errors; +} Index: External/HIP/saxpy.reference_output =================================================================== --- /dev/null +++ External/HIP/saxpy.reference_output @@ -0,0 +1,2 @@ +PASSED! +exit 0 Index: cmake/modules/Variant.cmake =================================================================== --- /dev/null +++ cmake/modules/Variant.cmake @@ -0,0 +1,72 @@ +include(External) + +# Helper macro to extract version number at the end of the string +# Input: get_version(Var String) +# Where String = /some/string/with/version-x.y.z +# Output: +# Sets Var=x.y.z +macro(get_version Var Path) + string(REGEX MATCH "[0-9]+(\\.[0-9]+)*$" ${Var} ${Path}) +endmacro (get_version) + +# Helper function to glob CUDA source files and set LANGUAGE property +# to CXX on each of them. Sets Var in parent scope to the list of +# files found. +macro(gpu_glob Var) + file(GLOB FileList ${ARGN}) + foreach(File IN LISTS FileList) + if(${File} MATCHES ".*\.cu$" OR ${File} MATCHES ".*\.hip$") + set_source_files_properties(${File} PROPERTIES LANGUAGE CXX) + endif() + endforeach() + set(${Var} ${FileList}) +endmacro(gpu_glob) + +macro(create_one_local_test_f Name FileGlob FilterRegex + VariantOffload VariantSuffix + VariantCPPFLAGS VariantLibs) + if (${VariantSuffix} MATCHES ${FilterRegex}) + gpu_glob(_sources ${FileGlob}) + set(_executable ${Name}-${VariantSuffix}) + set(_executable_path ${CMAKE_CURRENT_BINARY_DIR}/${_executable}) + llvm_test_run() + set(REFERENCE_OUTPUT) + # Verify reference output if it exists. + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${Name}.reference_output) + set(REFERENCE_OUTPUT ${Name}.reference_output) + llvm_test_verify(WORKDIR ${CMAKE_CURRENT_BINARY_DIR} + ${FPCMP} %o ${REFERENCE_OUTPUT}-${VariantSuffix} + ) + llvm_test_executable(${_executable} ${_sources}) + llvm_test_data(${_executable} + DEST_SUFFIX "-${VariantSuffix}" + ${REFERENCE_OUTPUT}) + else() + llvm_test_executable(${_executable} ${_sources}) + endif() + target_compile_options(${_executable} PUBLIC ${VariantCPPFLAGS}) + if(VariantLibs) + target_link_libraries(${_executable} ${VariantLibs}) + endif() + if (${VariantOffload} MATCHES "hip") + add_dependencies(hip-tests-simple-${VariantSuffix} ${_executable}) + else() + add_dependencies(cuda-tests-simple-${VariantSuffix} ${_executable}) + endif() + # Local tests are presumed to be fast. + list(APPEND VARIANT_SIMPLE_TEST_TARGETS ${_executable}.test) + endif() +endmacro() + +# Helper macro to create a local test for a VariantSuffix. +# Inputs: Name, File, Offload, Suffix, CPPFLAGS, Libs +# Output: Dependencies added for -tests-simple-, +# and VARIANT_SIMPLE_TEST_TARGETS created. +macro(create_one_local_test Name FileGlob + VariantOffload VariantSuffix + VariantCPPFLAGS VariantLibs) + create_one_local_test_f(${Name} ${FileGlob} ".*" + ${VariantOffload} ${VariantSuffix} + "${VariantCPPFLAGS}" "${VariantLibs}") +endmacro() +