diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2467,10 +2467,6 @@ Group, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; def fno_openmp_assume_threads_oversubscription : Flag<["-"], "fno-openmp-assume-threads-oversubscription">, Group, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; -defm openmp_target_new_runtime: BoolFOption<"openmp-target-new-runtime", - LangOpts<"OpenMPTargetNewRuntime">, DefaultTrue, - PosFlag, - NegFlag>; defm openmp_optimistic_collapse : BoolFOption<"openmp-optimistic-collapse", LangOpts<"OpenMPOptimisticCollapse">, DefaultFalse, PosFlag, NegFlag, BothFlags<[NoArgumentUnused, HelpHidden]>>; diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1203,8 +1203,7 @@ llvm_unreachable("OpenMP can only handle device code."); llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder(); - if (CGM.getLangOpts().OpenMPTargetNewRuntime && - !CGM.getLangOpts().OMPHostIRFile.empty()) { + if (!CGM.getLangOpts().OMPHostIRFile.empty()) { OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug, "__omp_rtl_debug_kind"); OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription, diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp --- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp @@ -290,11 +290,7 @@ return; std::string BitcodeSuffix; - if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime, - options::OPT_fno_openmp_target_new_runtime, true)) - BitcodeSuffix = "new-amdgpu-" + GPUArch; - else - BitcodeSuffix = "amdgcn-" + GPUArch; + BitcodeSuffix = "amdgcn-" + GPUArch; addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, BitcodeSuffix, getTriple()); diff --git 
a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5936,13 +5936,6 @@ options::OPT_fno_openmp_cuda_mode, /*Default=*/false)) CmdArgs.push_back("-fopenmp-cuda-mode"); - // When in OpenMP offloading mode, enable or disable the new device - // runtime. - if (Args.hasFlag(options::OPT_fopenmp_target_new_runtime, - options::OPT_fno_openmp_target_new_runtime, - /*Default=*/true)) - CmdArgs.push_back("-fopenmp-target-new-runtime"); - // When in OpenMP offloading mode, enable debugging on the device. Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_target_debug_EQ); if (Args.hasFlag(options::OPT_fopenmp_target_debug, @@ -8172,9 +8165,6 @@ StringRef Arch = TCArgs.getLastArgValue(options::OPT_march_EQ); std::string BitcodeSuffix; - if (TCArgs.hasFlag(options::OPT_fopenmp_target_new_runtime, - options::OPT_fno_openmp_target_new_runtime, true)) - BitcodeSuffix += "new-"; if (TC->getTriple().isNVPTX()) BitcodeSuffix += "nvptx-"; else if (TC->getTriple().isAMDGPU()) diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -749,11 +749,7 @@ return; std::string BitcodeSuffix; - if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime, - options::OPT_fno_openmp_target_new_runtime, true)) - BitcodeSuffix = "new-nvptx-" + GpuArch.str(); - else - BitcodeSuffix = "nvptx-" + GpuArch.str(); + BitcodeSuffix = "nvptx-" + GpuArch.str(); addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, BitcodeSuffix, getTriple()); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3484,9 +3484,6 @@ GenerateArg(Args, OPT_fopenmp_version_EQ, Twine(Opts.OpenMP), SA); } - if (Opts.OpenMPTargetNewRuntime) - 
GenerateArg(Args, OPT_fopenmp_target_new_runtime, SA); - if (Opts.OpenMPThreadSubscription) GenerateArg(Args, OPT_fopenmp_assume_threads_oversubscription, SA); @@ -3877,9 +3874,6 @@ Opts.OpenMP && Args.hasArg(options::OPT_fopenmp_enable_irbuilder); bool IsTargetSpecified = Opts.OpenMPIsDevice || Args.hasArg(options::OPT_fopenmp_targets_EQ); - Opts.OpenMPTargetNewRuntime = - Opts.OpenMPIsDevice && - Args.hasArg(options::OPT_fopenmp_target_new_runtime); Opts.ConvergentFunctions = Opts.ConvergentFunctions || Opts.OpenMPIsDevice; @@ -3927,17 +3921,13 @@ // Set either by a specific value or to a default if not specified. if (Opts.OpenMPIsDevice && (Args.hasArg(OPT_fopenmp_target_debug) || Args.hasArg(OPT_fopenmp_target_debug_EQ))) { - if (Opts.OpenMPTargetNewRuntime) { Opts.OpenMPTargetDebug = getLastArgIntValue( Args, OPT_fopenmp_target_debug_EQ, Opts.OpenMPTargetDebug, Diags); if (!Opts.OpenMPTargetDebug && Args.hasArg(OPT_fopenmp_target_debug)) Opts.OpenMPTargetDebug = 1; - } else { - Diags.Report(diag::err_drv_debug_no_new_runtime); - } } - if (Opts.OpenMPIsDevice && Opts.OpenMPTargetNewRuntime) { + if (Opts.OpenMPIsDevice) { if (Args.hasArg(OPT_fopenmp_assume_teams_oversubscription)) Opts.OpenMPTeamSubscription = true; if (Args.hasArg(OPT_fopenmp_assume_threads_oversubscription)) diff --git a/clang/test/Driver/amdgpu-openmp-toolchain.c b/clang/test/Driver/amdgpu-openmp-toolchain.c --- a/clang/test/Driver/amdgpu-openmp-toolchain.c +++ b/clang/test/Driver/amdgpu-openmp-toolchain.c @@ -1,6 +1,6 @@ // REQUIRES: x86-registered-target // REQUIRES: amdgpu-registered-target -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib %s 2>&1 \ +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 
--libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib %s 2>&1 \ // RUN: | FileCheck %s // verify the tools invocations @@ -14,7 +14,7 @@ // CHECK: clang{{.*}}"-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-o" "{{.*}}a-{{.*}}.o" "-x" "ir" "{{.*}}a-{{.*}}.bc" // CHECK: ld{{.*}}"-o" "a.out"{{.*}}"{{.*}}amdgpu-openmp-toolchain-{{.*}}.o" "{{.*}}a-{{.*}}.o" "-lomp" "-lomptarget" -// RUN: %clang -ccc-print-phases --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 %s 2>&1 \ +// RUN: %clang -ccc-print-phases --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-PHASES %s // phases // CHECK-PHASES: 0: input, "{{.*}}amdgpu-openmp-toolchain.c", c, (host-openmp) @@ -36,13 +36,13 @@ // CHECK-PHASES: 16: linker, {4, 15}, image, (host-openmp) // handling of --libomptarget-amdgcn-bc-path -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib/libomptarget-amdgcn-gfx803.bc %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIBOMPTARGET +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib/libomptarget-amdgcn-gfx803.bc %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIBOMPTARGET // CHECK-LIBOMPTARGET: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-target-cpu" "gfx803" "-fcuda-is-device" "-mlink-builtin-bitcode"{{.*}}Inputs/hip_dev_lib/libomptarget-amdgcn-gfx803.bc"{{.*}} -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa 
-march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-NOGPULIB +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-NOGPULIB // CHECK-NOGPULIB-NOT: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-target-cpu" "gfx803" "-fcuda-is-device" "-mlink-builtin-bitcode"{{.*}}libomptarget-amdgcn-gfx803.bc"{{.*}} -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-PRINT-BINDINGS +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-PRINT-BINDINGS // CHECK-PRINT-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.*]]"], // CHECK-PRINT-BINDINGS: "x86_64-unknown-linux-gnu" - "clang",{{.*}} output: "[[HOST_BC:.*]]" // CHECK-PRINT-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]"], output: "[[HOST_S:.*]]" @@ -56,13 +56,13 @@ // CHECK-PRINT-BINDINGS: "x86_64-unknown-linux-gnu" - "GNU::Linker", inputs: ["[[HOST_O]]", "[[OFFLOAD_O]]"], output: // verify the llc is invoked for textual assembly output -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib -save-temps %s 2>&1 \ +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib -save-temps %s 2>&1 \ // RUN: | FileCheck %s 
--check-prefix=CHECK-SAVE-ASM // CHECK-SAVE-ASM: llc{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906-linked.bc" "-mtriple=amdgcn-amd-amdhsa" "-mcpu=gfx906" "-filetype=asm" "-o"{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906.s" // CHECK-SAVE-ASM: llc{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906-linked.bc" "-mtriple=amdgcn-amd-amdhsa" "-mcpu=gfx906" "-filetype=obj" "-o"{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906.o" // check the handling of -c -// RUN: %clang -ccc-print-bindings -c --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib -save-temps %s 2>&1 \ +// RUN: %clang -ccc-print-bindings -c --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib -save-temps %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-C // CHECK-C: "x86_64-unknown-linux-gnu" - "clang", // CHECK-C: "x86_64-unknown-linux-gnu" - "clang",{{.*}}output: "[[HOST_BC:.*]]" @@ -72,8 +72,8 @@ // CHECK-C: "x86_64-unknown-linux-gnu" - "clang::as" // CHECK-C: "x86_64-unknown-linux-gnu" - "offload bundler" -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-EMIT-LLVM-IR +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-EMIT-LLVM-IR // CHECK-EMIT-LLVM-IR: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-emit-llvm" -// RUN: %clang -### -target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa 
-march=gfx803 -lm --rocm-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIB-DEVICE +// RUN: %clang -### -target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -lm --rocm-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIB-DEVICE // CHECK-LIB-DEVICE: {{.*}}llvm-link{{.*}}ocml.bc"{{.*}}ockl.bc"{{.*}}oclc_daz_opt_on.bc"{{.*}}oclc_unsafe_math_off.bc"{{.*}}oclc_finite_only_off.bc"{{.*}}oclc_correctly_rounded_sqrt_on.bc"{{.*}}oclc_wavefrontsize64_on.bc"{{.*}}oclc_isa_version_803.bc" diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c --- a/clang/test/Driver/openmp-offload-gpu.c +++ b/clang/test/Driver/openmp-offload-gpu.c @@ -155,43 +155,24 @@ // RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ // RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-test.bc \ // RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \ -// RUN: -fopenmp-relocatable-target -fopenmp-target-new-runtime -save-temps -no-canonical-prefixes %s 2>&1 \ +// RUN: -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-BCLIB %s /// Specify the directory containing the bitcode lib, check clang picks the right one // RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ // RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget \ // RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \ -// RUN: -fopenmp-relocatable-target -fno-openmp-target-new-runtime -save-temps \ +// RUN: -fopenmp-relocatable-target -save-temps \ // RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB-DIR %s -/// Check with the new runtime enabled -// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ -// RUN: -Xopenmp-target 
-march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \ -// RUN: -fopenmp-relocatable-target -fopenmp-target-new-runtime \ -// RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-new-nvptx-test.bc \ -// RUN: -save-temps -no-canonical-prefixes %s 2>&1 \ -// RUN: | FileCheck -check-prefix=CHK-BCLIB-NEW %s - -/// Check with new runtime and specifying the directory -// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ -// RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \ - -// RUN: -fopenmp-relocatable-target -fopenmp-target-new-runtime \ -// RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget -save-temps \ -// RUN: -no-canonical-prefixes %s 2>&1 \ -// RUN: | FileCheck -check-prefix=CHK-BCLIB-NEW-DIR %s - /// Create a bogus bitcode library and find it with LIBRARY_PATH // RUN: env LIBRARY_PATH=%S/Inputs/libomptarget/subdir %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ // RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \ -// RUN: -fopenmp-relocatable-target -fno-openmp-target-new-runtime -save-temps \ +// RUN: -fopenmp-relocatable-target -save-temps \ // RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-ENV-BCLIB %s // CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-test.bc // CHK-BCLIB-DIR: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget{{/|\\\\}}libomptarget-nvptx-sm_35.bc -// CHK-BCLIB-NEW: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-new-nvptx-test.bc -// CHK-BCLIB-NEW-DIR: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget{{/|\\\\}}libomptarget-new-nvptx-sm_35.bc // CHK-ENV-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}subdir{{/|\\\\}}libomptarget-nvptx-sm_35.bc // CHK-BCLIB-NOT: 
{{error:|warning:}} @@ -204,7 +185,7 @@ // RUN: -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-BCLIB-WARN %s -// CHK-BCLIB-WARN: no library 'libomptarget-new-nvptx-sm_35.bc' found in the default clang lib directory or in LIBRARY_PATH; use '--libomptarget-nvptx-bc-path' to specify nvptx bitcode library +// CHK-BCLIB-WARN: no library 'libomptarget-nvptx-sm_35.bc' found in the default clang lib directory or in LIBRARY_PATH; use '--libomptarget-nvptx-bc-path' to specify nvptx bitcode library /// ########################################################################### diff --git a/clang/test/OpenMP/target_globals_codegen.cpp b/clang/test/OpenMP/target_globals_codegen.cpp --- a/clang/test/OpenMP/target_globals_codegen.cpp +++ b/clang/test/OpenMP/target_globals_codegen.cpp @@ -1,12 +1,12 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --global-value-regex "__omp_rtl_" // Test target codegen - host bc file has to be created first. 
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-target-debug -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-target-debug=111 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-EQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-DEFAULT -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-assume-threads-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-THREADS -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-assume-teams-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-TEAMS -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-assume-teams-oversubscription -fopenmp-is-device -o - | FileCheck %s --check-prefix=CHECK-RUNTIME +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-debug 
-fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-debug=111 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-EQ +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-DEFAULT +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-threads-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-THREADS +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-TEAMS +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -o - | FileCheck %s --check-prefix=CHECK-RUNTIME // expected-no-diagnostics #ifndef HEADER diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt --- a/openmp/libomptarget/CMakeLists.txt +++ b/openmp/libomptarget/CMakeLists.txt @@ -38,13 +38,11 @@ # This is a list of all the targets that are supported/tested right now. 
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} aarch64-unknown-linux-gnu") set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} amdgcn-amd-amdhsa") -set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} amdgcn-amd-amdhsa-newRTL") set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} amdgcn-amd-amdhsa-newDriver") set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64le-ibm-linux-gnu") set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64-ibm-linux-gnu") set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-pc-linux-gnu") set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda") -set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-newRTL") set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-newDriver") # Once the plugins for the different targets are validated, they will be added to @@ -81,7 +79,6 @@ # Build offloading plugins and device RTLs if they are available. add_subdirectory(plugins) -add_subdirectory(deviceRTLs) add_subdirectory(DeviceRTL) add_subdirectory(tools) diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt --- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt +++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt @@ -180,7 +180,7 @@ list(APPEND bc_files ${outfile}) endforeach() - set(bclib_name "libomptarget-new-${target_name}-${target_cpu}.bc") + set(bclib_name "libomptarget-${target_name}-${target_cpu}.bc") # Link to a bitcode library. 
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name} @@ -212,7 +212,7 @@ set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name}) - set(bclib_target_name "omptarget-new-${target_name}-${target_cpu}-bc") + set(bclib_target_name "omptarget-${target_name}-${target_cpu}-bc") add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}) diff --git a/openmp/libomptarget/deviceRTLs/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# ##===----------------------------------------------------------------------===## -# -# Build a device RTL for each available machine. -# -##===----------------------------------------------------------------------===## - -add_subdirectory(amdgcn) -add_subdirectory(nvptx) diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt +++ /dev/null @@ -1,193 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build the AMDGCN Device RTL bitcode library using clang -ffreestanding -# -##===----------------------------------------------------------------------===## - -set(LIBOMPTARGET_BUILD_AMDGCN_BCLIB FALSE CACHE BOOL - "Can be set to true to enable building this library.") - -if (NOT LIBOMPTARGET_BUILD_AMDGCN_BCLIB) - libomptarget_say("Not building AMDGCN device RTL: Disabled by LIBOMPTARGET_BUILD_AMDGCN_BCLIB") - return() -endif() - -if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS) - libomptarget_say("Not building AMDGCN device RTL: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS") - return() -endif() - - -# Copied from nvptx CMakeLists -if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") - set(aux_triple x86_64-unknown-linux-gnu) -elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le") - set(aux_triple powerpc64le-unknown-linux-gnu) -elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") - set(aux_triple aarch64-unknown-linux-gnu) -else() - libomptarget_say("Not building AMDGCN device RTL: unknown host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}") - return() -endif() - -if (LLVM_DIR) - # Builds that use pre-installed LLVM have LLVM_DIR set. - find_program(CLANG_TOOL clang PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH) - find_program(LINK_TOOL llvm-link PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH) - find_program(OPT_TOOL opt PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH) - if ((NOT CLANG_TOOL) OR (NOT LINK_TOOL) OR (NOT OPT_TOOL)) - libomptarget_say("Not building AMDGCN device RTL. Missing clang: ${CLANG_TOOL}, llvm-link: ${LINK_TOOL} or opt: ${OPT_TOOL}") - return() - else() - libomptarget_say("Building AMDGCN device RTL. 
Using clang: ${CLANG_TOOL}, llvm-link: ${LINK_TOOL} and opt: ${OPT_TOOL}") - endif() -elseif (LLVM_TOOL_CLANG_BUILD AND NOT CMAKE_CROSSCOMPILING AND NOT OPENMP_STANDALONE_BUILD) - # LLVM in-tree builds may use CMake target names to discover the tools. - set(CLANG_TOOL $) - set(LINK_TOOL $) - set(OPT_TOOL $) - libomptarget_say("Building AMDGCN device RTL. Using clang from in-tree build") -else() - libomptarget_say("Not building AMDGCN device RTL. No appropriate clang found") - return() -endif() - -project(omptarget-amdgcn) - -add_custom_target(omptarget-amdgcn ALL) - -#optimization level -set(optimization_level 2) - -# Activate RTL message dumps if requested by the user. -if(LIBOMPTARGET_NVPTX_DEBUG) - set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1 -g) -endif() - -get_filename_component(devicertl_base_directory - ${CMAKE_CURRENT_SOURCE_DIR} - DIRECTORY) - -set(cuda_sources - ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_smid.hip - ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_locks.hip - ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.hip - ${devicertl_base_directory}/common/src/cancel.cu - ${devicertl_base_directory}/common/src/critical.cu - ${devicertl_base_directory}/common/src/data_sharing.cu - ${devicertl_base_directory}/common/src/libcall.cu - ${devicertl_base_directory}/common/src/loop.cu - ${devicertl_base_directory}/common/src/omp_data.cu - ${devicertl_base_directory}/common/src/omptarget.cu - ${devicertl_base_directory}/common/src/parallel.cu - ${devicertl_base_directory}/common/src/reduction.cu - ${devicertl_base_directory}/common/src/support.cu - ${devicertl_base_directory}/common/src/shuffle.cpp - ${devicertl_base_directory}/common/src/sync.cu - ${devicertl_base_directory}/common/src/task.cu) - -set(h_files - ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_interface.h - ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.h - ${devicertl_base_directory}/common/debug.h - ${devicertl_base_directory}/common/omptarget.h - ${devicertl_base_directory}/common/omptargeti.h - 
${devicertl_base_directory}/common/state-queue.h - ${devicertl_base_directory}/common/state-queuei.h - ${devicertl_base_directory}/common/support.h) - -# for both in-tree and out-of-tree build -if (NOT CMAKE_ARCHIVE_OUTPUT_DIRECTORY) - set(OUTPUTDIR ${CMAKE_CURRENT_BINARY_DIR}) -else() - set(OUTPUTDIR ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}) -endif() - -# create gfx bitcode libraries -set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900 gfx902 gfx906 gfx908 gfx90a gfx1010 gfx1030 gfx1031) -if (DEFINED LIBOMPTARGET_AMDGCN_GFXLIST) - set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST}) -endif() - -# Prepend -I to each list element -set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}") -list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN PREPEND "-I") - -macro(add_cuda_bc_library) - set(cu_cmd ${CLANG_TOOL} - -xc++ - -c - -mllvm -openmp-opt-disable - -std=c++14 - -ffreestanding - -target amdgcn-amd-amdhsa - -emit-llvm - -Xclang -aux-triple -Xclang ${aux_triple} - -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device - -D__AMDGCN__ - -Xclang -target-cpu -Xclang ${mcpu} - -fvisibility=hidden - -Wno-unused-value - -nogpulib - -O${optimization_level} - ${CUDA_DEBUG} - -I${CMAKE_CURRENT_SOURCE_DIR}/src - -I${devicertl_base_directory}/common/include - -I${devicertl_base_directory} - -I${devicertl_base_directory}/../include - ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN}) - - set(bc1_files) - - foreach(file ${ARGN}) - get_filename_component(fname ${file} NAME_WE) - set(bc1_filename ${fname}.${mcpu}.bc) - - add_custom_command( - OUTPUT ${bc1_filename} - COMMAND ${cu_cmd} ${file} -o ${bc1_filename} - DEPENDS ${file} ${h_files}) - - list(APPEND bc1_files ${bc1_filename}) - endforeach() - - add_custom_command( - OUTPUT linkout.cuda.${mcpu}.bc - COMMAND ${LINK_TOOL} ${bc1_files} -o linkout.cuda.${mcpu}.bc - DEPENDS ${bc1_files}) - - list(APPEND bc_files linkout.cuda.${mcpu}.bc) -endmacro() - -set(libname "omptarget-amdgcn") - -set(toolchain_deps "") -if(TARGET llvm-link) - 
list(APPEND toolchain_deps llvm-link) -endif() -if(TARGET opt) - list(APPEND toolchain_deps opt) -endif() - -foreach(mcpu ${mcpus}) - set(bc_files) - add_cuda_bc_library(${cuda_sources}) - - set(bc_libname lib${libname}-${mcpu}.bc) - add_custom_command( - OUTPUT ${bc_libname} - COMMAND ${LINK_TOOL} ${bc_files} | ${OPT_TOOL} --always-inline -o ${OUTPUTDIR}/${bc_libname} - DEPENDS ${bc_files} ${toolchain_deps}) - - add_custom_target(lib${libname}-${mcpu} ALL DEPENDS ${bc_libname}) - - install(FILES ${OUTPUTDIR}/${bc_libname} - DESTINATION "${OPENMP_INSTALL_LIBDIR}" - ) -endforeach() diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h +++ /dev/null @@ -1,19 +0,0 @@ -//===--- amdgcn_interface.h - OpenMP interface definitions ------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _AMDGCN_INTERFACE_H_ -#define _AMDGCN_INTERFACE_H_ - -#include - -#define EXTERN extern "C" -typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ - -EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads(); - -#endif diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip +++ /dev/null @@ -1,34 +0,0 @@ -//===-- amdgcn_locks.hip - AMDGCN OpenMP GPU lock implementation -- HIP -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// A 'thread' maps onto a lane of the wavefront. This means a per-thread lock -// cannot be implemented - if one thread gets the lock, it can't continue on to -// the next instruction in order to do anything as the other threads are waiting -// to take the lock. -// These functions will be implemented to provide the documented semantics for -// a SIMD => wavefront mapping once that is implemented. -// -//===----------------------------------------------------------------------===// -#pragma omp declare target - -#include "common/debug.h" - -static void warn() { - PRINT0(LD_ALL, "Locks are not supported in this thread mapping model"); -} - -void __kmpc_impl_init_lock(omp_lock_t *) { warn(); } -void __kmpc_impl_destroy_lock(omp_lock_t *) { warn(); } -void __kmpc_impl_set_lock(omp_lock_t *) { warn(); } -void __kmpc_impl_unset_lock(omp_lock_t *) { warn(); } -int __kmpc_impl_test_lock(omp_lock_t *lock) { - warn(); - return 0; -} - -#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip +++ /dev/null @@ -1,64 +0,0 @@ -//===-------- amdgcn_smid.hip - AMDGCN smid implementation -------- HIP -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#pragma omp declare target - -#include "target_impl.h" - -// Partially derived fom hcc_detail/device_functions.h - -// HW_ID Register bit structure -// WAVE_ID 3:0 Wave buffer slot number. 0-9. 
-// SIMD_ID 5:4 SIMD which the wave is assigned to within the CU. -// PIPE_ID 7:6 Pipeline from which the wave was dispatched. -// CU_ID 11:8 Compute Unit the wave is assigned to. -// SH_ID 12 Shader Array (within an SE) the wave is assigned to. -// SE_ID 14:13 Shader Engine the wave is assigned to. -// TG_ID 19:16 Thread-group ID -// VM_ID 23:20 Virtual Memory ID -// QUEUE_ID 26:24 Queue from which this wave was dispatched. -// STATE_ID 29:27 State ID (graphics only, not compute). -// ME_ID 31:30 Micro-engine ID. - -enum { - HW_ID = 4, // specify that the hardware register to read is HW_ID - - HW_ID_CU_ID_SIZE = 4, // size of CU_ID field in bits - HW_ID_CU_ID_OFFSET = 8, // offset of CU_ID from start of register - - HW_ID_SE_ID_SIZE = 2, // sizeof SE_ID field in bits - HW_ID_SE_ID_OFFSET = 13, // offset of SE_ID from start of register -}; - -// The s_getreg_b32 instruction, exposed as an intrinsic, takes a 16 bit -// immediate and returns a 32 bit value. -// The encoding of the immediate parameter is: -// ID 5:0 Which register to read from -// OFFSET 10:6 Range: 0..31 -// WIDTH 15:11 Range: 1..32 - -// The asm equivalent is s_getreg_b32 %0, hwreg(HW_REG_HW_ID, Offset, Width) -// where hwreg forms a 16 bit immediate encoded by the assembler thus: -// uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) { -// return (Id << 0_) | (Offset << 6) | ((Width - 1) << 11); -// } -#define ENCODE_HWREG(WIDTH, OFF, REG) (REG | (OFF << 6) | ((WIDTH - 1) << 11)) - -// Note: The results can be changed by a context switch -// Return value in [0 2^SE_ID_SIZE * 2^CU_ID_SIZE), which is an upper -// bound on how many compute units are available. Some values in this -// range may never be returned if there are fewer than 2^CU_ID_SIZE CUs. 
- -EXTERN uint32_t __kmpc_impl_smid() { - uint32_t cu_id = __builtin_amdgcn_s_getreg( - ENCODE_HWREG(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID)); - uint32_t se_id = __builtin_amdgcn_s_getreg( - ENCODE_HWREG(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID)); - return (se_id << HW_ID_CU_ID_SIZE) + cu_id; -} - -#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ /dev/null @@ -1,83 +0,0 @@ -//===------- target_impl.h - AMDGCN OpenMP GPU implementation ----- HIP -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Declarations and definitions of target specific functions and constants -// -//===----------------------------------------------------------------------===// -#ifndef OMPTARGET_AMDGCN_TARGET_IMPL_H -#define OMPTARGET_AMDGCN_TARGET_IMPL_H - -#ifndef __AMDGCN__ -#error "amdgcn target_impl.h expects to be compiled under __AMDGCN__" -#endif - -#include "amdgcn_interface.h" - -#include -#include - -// subset of inttypes.h -#define PRId64 "ld" -#define PRIu64 "lu" - -typedef uint64_t __kmpc_impl_lanemask_t; - -#define INLINE inline -#define NOINLINE __attribute__((noinline)) -#define ALIGN(N) __attribute__((aligned(N))) -#define PLUGIN_ACCESSIBLE \ - __attribute__((used)) /* Don't discard values the plugin reads */ \ - __attribute__((weak)) /* We may have multiple definitions */ \ - __attribute__((retain)) /* Also needed to keep values alive */ \ - __attribute__((visibility("protected"))) /* Access via SHT_HASH */ \ - __attribute__((section(".data"))) /* Not .bss, can write before load */ - -#include 
"llvm/Frontend/OpenMP/OMPGridValues.h" - -INLINE constexpr const llvm::omp::GV &getGridValue() { - return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>(); -} - -//////////////////////////////////////////////////////////////////////////////// -// Kernel options -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// The following def must match the absolute limit hardwired in the host RTL -// max number of threads per team -enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size }; -enum { WARPSIZE = getGridValue().GV_Warp_Size }; - -// Maximum number of omp state objects per SM allocated statically in global -// memory. -#define OMP_STATE_COUNT 32 -#define MAX_SM 64 - -#define OMP_ACTIVE_PARALLEL_LEVEL 128 - -// Data sharing related quantities, need to match what is used in the compiler. -enum DATA_SHARING_SIZES { - // The size reserved for data in a shared memory slot. - DS_Slot_Size = getGridValue().GV_Slot_Size, - // The slot size that should be reserved for a working warp. - DS_Worker_Warp_Slot_Size = getGridValue().warpSlotSize(), - // The maximum number of warps in use - DS_Max_Warp_Number = getGridValue().maxWarpNumber(), -}; - -enum : __kmpc_impl_lanemask_t { - __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0 -}; - -// The return code of printf is not checked in the call sites in this library. -// A call to a function named printf currently hits some special case handling -// for opencl, which translates to calls that do not presently exist for openmp -// Therefore, for now, stub out printf while building this library. -#define printf(...) 
- -#endif diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip +++ /dev/null @@ -1,226 +0,0 @@ -//===------- target_impl.hip - AMDGCN OpenMP GPU implementation --- HIP -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Definitions of target specific functions -// -//===----------------------------------------------------------------------===// -#pragma omp declare target - -#include "common/omptarget.h" -#include "target_impl.h" -#include "target_interface.h" - -// Implementations initially derived from hcc - -// Initialized with a 64-bit mask with bits set in positions less than the -// thread's lane number in the warp -EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { - uint32_t lane = GetLaneId(); - int64_t ballot = __kmpc_impl_activemask(); - uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1; - return mask & ballot; -} - -// Initialized with a 64-bit mask with bits set in positions greater than the -// thread's lane number in the warp -EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { - uint32_t lane = GetLaneId(); - if (lane == (WARPSIZE - 1)) - return 0; - uint64_t ballot = __kmpc_impl_activemask(); - uint64_t mask = (~((uint64_t)0)) << (lane + 1); - return mask & ballot; -} - -EXTERN double __kmpc_impl_get_wtick() { return ((double)1E-9); } - -EXTERN double __kmpc_impl_get_wtime() { - // The intrinsics for measuring time have undocumented frequency - // This will probably need to be found by measurement on a number of - // architectures. 
Until then, return 0, which is very inaccurate as a - // timer but resolves the undefined symbol at link time. - return 0; -} - -// Warp vote function -EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() { - return __builtin_amdgcn_read_exec(); -} - -static void pteam_mem_barrier(uint32_t num_threads, uint32_t *barrier_state) { - __atomic_thread_fence(__ATOMIC_ACQUIRE); - - uint32_t num_waves = (num_threads + WARPSIZE - 1) / WARPSIZE; - - // Partial barrier implementation for amdgcn. - // Uses two 16 bit unsigned counters. One for the number of waves to have - // reached the barrier, and one to count how many times the barrier has been - // passed. These are packed in a single atomically accessed 32 bit integer. - // Low bits for the number of waves, assumed zero before this call. - // High bits to count the number of times the barrier has been passed. - - // precondition: num_waves != 0; - // invariant: num_waves * WARPSIZE == num_threads; - // precondition: num_waves < 0xffffu; - - // Increment the low 16 bits once, using the lowest active thread. - uint64_t lowestActiveThread = __kmpc_impl_ffs(__kmpc_impl_activemask()) - 1; - bool isLowest = GetLaneId() == lowestActiveThread; - - if (isLowest) { - uint32_t load = __atomic_fetch_add(barrier_state, 1, - __ATOMIC_RELAXED); // commutative - - // Record the number of times the barrier has been passed - uint32_t generation = load & 0xffff0000u; - - if ((load & 0x0000ffffu) == (num_waves - 1)) { - // Reached num_waves in low bits so this is the last wave. 
- // Set low bits to zero and increment high bits - load += 0x00010000u; // wrap is safe - load &= 0xffff0000u; // because bits zeroed second - - // Reset the wave counter and release the waiting waves - __atomic_store_n(barrier_state, load, __ATOMIC_RELAXED); - } else { - // more waves still to go, spin until generation counter changes - do { - __builtin_amdgcn_s_sleep(0); - load = __atomic_load_n(barrier_state, __ATOMIC_RELAXED); - } while ((load & 0xffff0000u) == generation); - } - } - __atomic_thread_fence(__ATOMIC_RELEASE); -} - -uint32_t __kmpc_L0_Barrier [[clang::loader_uninitialized]]; -#pragma allocate(__kmpc_L0_Barrier) allocator(omp_pteam_mem_alloc) - -EXTERN void __kmpc_impl_target_init() { - // Don't have global ctors, and shared memory is not zero init - __atomic_store_n(&__kmpc_L0_Barrier, 0u, __ATOMIC_RELEASE); -} - -EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) { - pteam_mem_barrier(num_threads, &__kmpc_L0_Barrier); -} - -namespace { -uint32_t get_grid_dim(uint32_t n, uint16_t d) { - uint32_t q = n / d; - return q + (n > q * d); -} -uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size, - uint16_t group_size) { - uint32_t r = grid_size - group_id * group_size; - return (r < group_size) ? 
r : group_size; -} -} // namespace - -EXTERN int __kmpc_get_hardware_num_blocks() { - return get_grid_dim(__builtin_amdgcn_grid_size_x(), - __builtin_amdgcn_workgroup_size_x()); -} - -EXTERN int __kmpc_get_hardware_num_threads_in_block() { - return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), - __builtin_amdgcn_grid_size_x(), - __builtin_amdgcn_workgroup_size_x()); -} - -EXTERN unsigned __kmpc_get_warp_size() { - return WARPSIZE; -} - -EXTERN unsigned GetWarpId() { return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE; } -EXTERN unsigned GetLaneId() { - return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); -} - -EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() { - return __kmpc_get_hardware_num_threads_in_block(); -} - -// Atomics -uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) { - return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST); -} -uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) { - return __builtin_amdgcn_atomic_inc32(Address, Val, __ATOMIC_SEQ_CST, ""); -} -uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) { - return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST); -} - -uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) { - uint32_t R; - __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST); - return R; -} -uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, uint32_t Val) { - (void)__atomic_compare_exchange(Address, &Compare, &Val, false, - __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); - return Compare; -} - -unsigned long long __kmpc_atomic_exchange(unsigned long long *Address, - unsigned long long Val) { - unsigned long long R; - __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST); - return R; -} -unsigned long long __kmpc_atomic_add(unsigned long long *Address, - unsigned long long Val) { - return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST); -} - -// Stub implementations -// Weak to allow overriding by local versions while comparing different -// 
potential implementations -__attribute__((weak)) EXTERN void *__kmpc_impl_malloc(size_t) { - return nullptr; -} -__attribute__((weak)) EXTERN void __kmpc_impl_free(void *) {} - -EXTERN -int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t) { - return -1; -} - -EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { - lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF)); - hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32); -} - -EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) { - return (((uint64_t)hi) << 32) | (uint64_t)lo; -} - -EXTERN void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); } - -EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) { - // AMDGCN doesn't need to sync threads in a warp -} - -EXTERN void __kmpc_impl_threadfence() { - __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent"); -} - -EXTERN void __kmpc_impl_threadfence_block() { - __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup"); -} - -EXTERN void __kmpc_impl_threadfence_system() { - __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, ""); -} - -// Calls to the AMDGCN layer (assuming 1D layout) -EXTERN int __kmpc_get_hardware_thread_id_in_block() { return __builtin_amdgcn_workitem_id_x(); } -EXTERN int GetBlockIdInKernel() { return __builtin_amdgcn_workgroup_id_x(); } - -#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/allocator.h b/openmp/libomptarget/deviceRTLs/common/allocator.h deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/allocator.h +++ /dev/null @@ -1,44 +0,0 @@ -//===--------- allocator.h - OpenMP target memory allocator ------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Macros for allocating variables in different address spaces. -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_ALLOCATOR_H -#define OMPTARGET_ALLOCATOR_H - -#if _OPENMP -// Follows the pattern in interface.h -// Clang sema checks this type carefully, needs to closely match that from omp.h -typedef enum omp_allocator_handle_t { - omp_null_allocator = 0, - omp_default_mem_alloc = 1, - omp_large_cap_mem_alloc = 2, - omp_const_mem_alloc = 3, - omp_high_bw_mem_alloc = 4, - omp_low_lat_mem_alloc = 5, - omp_cgroup_mem_alloc = 6, - omp_pteam_mem_alloc = 7, - omp_thread_mem_alloc = 8, - KMP_ALLOCATOR_MAX_HANDLE = ~(0U) -} omp_allocator_handle_t; - -#define __PRAGMA(STR) _Pragma(#STR) -#define OMP_PRAGMA(STR) __PRAGMA(omp STR) - -#define SHARED(NAME) \ - NAME [[clang::loader_uninitialized]]; \ - OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc)) - -#define EXTERN_SHARED(NAME) \ - NAME; \ - OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc)) -#endif - -#endif // OMPTARGET_ALLOCATOR_H diff --git a/openmp/libomptarget/deviceRTLs/common/debug.h b/openmp/libomptarget/deviceRTLs/common/debug.h deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/debug.h +++ /dev/null @@ -1,293 +0,0 @@ -//===------------- debug.h - NVPTX OpenMP debug macros ----------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains debug macros to be used in the application. 
-// -// Usage guide -// -// PRINT0(flag, str) : if debug flag is on, print (no arguments) -// PRINT(flag, str, args) : if debug flag is on, print (arguments) -// DON(flag) : return true if debug flag is on -// -// ASSERT(flag, cond, str, args): if test flag is on, test the condition -// if the condition is false, print str+args -// and assert. -// CAUTION: cond may be evaluate twice -// AON(flag) : return true if test flag is on -// -// WARNING(flag, str, args) : if warning flag is on, print the warning -// WON(flag) : return true if warning flag is on -// -//===----------------------------------------------------------------------===// - -#ifndef _OMPTARGET_NVPTX_DEBUG_H_ -#define _OMPTARGET_NVPTX_DEBUG_H_ - -#include "target_interface.h" - -//////////////////////////////////////////////////////////////////////////////// -// set desired level of debugging -//////////////////////////////////////////////////////////////////////////////// - -#define LD_SET_NONE 0ULL /* none */ -#define LD_SET_ALL -1ULL /* all */ - -// pos 1 -#define LD_SET_LOOP 0x1ULL /* basic loop */ -#define LD_SET_LOOPD 0x2ULL /* basic loop */ -#define LD_SET_PAR 0x4ULL /* basic parallel */ -#define LD_SET_PARD 0x8ULL /* basic parallel */ - -// pos 2 -#define LD_SET_SYNC 0x10ULL /* sync info */ -#define LD_SET_SYNCD 0x20ULL /* sync info */ -#define LD_SET_WAIT 0x40ULL /* state when waiting */ -#define LD_SET_TASK 0x80ULL /* print task info (high level) */ - -// pos 3 -#define LD_SET_IO 0x100ULL /* big region io (excl atomic) */ -#define LD_SET_IOD 0x200ULL /* big region io (excl atomic) */ -#define LD_SET_ENV 0x400ULL /* env info */ -#define LD_SET_CANCEL 0x800ULL /* print cancel info */ - -// pos 4 -#define LD_SET_MEM 0x1000ULL /* malloc / free */ - -//////////////////////////////////////////////////////////////////////////////// -// set the desired flags to print selected output. - -// these are some examples of possible definitions that can be used for -// debugging. 
-//#define OMPTARGET_NVPTX_DEBUG (LD_SET_ALL) -//#define OMPTARGET_NVPTX_DEBUG (LD_SET_LOOP) // limit to loop printfs to save -// on cuda buffer -//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO) -//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO | LD_SET_ENV) -//#define OMPTARGET_NVPTX_DEBUG (LD_SET_PAR) - -#ifndef OMPTARGET_NVPTX_DEBUG -#define OMPTARGET_NVPTX_DEBUG LD_SET_NONE -#elif OMPTARGET_NVPTX_DEBUG -#warning debug is used, not good for measurements -#endif - -//////////////////////////////////////////////////////////////////////////////// -// set desired level of asserts -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// available flags - -#define LT_SET_NONE 0x0 /* unsafe */ -#define LT_SET_SAFETY \ - 0x1 /* check malloc type of stuff, input at creation, cheap */ -#define LT_SET_INPUT 0x2 /* check also all runtime inputs */ -#define LT_SET_FUSSY 0x4 /* fussy checks, expensive */ - -//////////////////////////////////////////////////////////////////////////////// -// set the desired flags - -#ifndef OMPTARGET_NVPTX_TEST -#if OMPTARGET_NVPTX_DEBUG -#define OMPTARGET_NVPTX_TEST (LT_SET_FUSSY) -#else -#define OMPTARGET_NVPTX_TEST (LT_SET_SAFETY) -#endif -#endif - -//////////////////////////////////////////////////////////////////////////////// -// set desired level of warnings -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// available flags - -#define LW_SET_ALL -1 -#define LW_SET_NONE 0x0 -#define LW_SET_ENV 0x1 -#define LW_SET_INPUT 0x2 -#define LW_SET_FUSSY 0x4 - -//////////////////////////////////////////////////////////////////////////////// -// set the desired flags - -#if OMPTARGET_NVPTX_DEBUG -#define OMPTARGET_NVPTX_WARNING (LW_SET_NONE) -#else -#define OMPTARGET_NVPTX_WARNING (LW_SET_FUSSY) -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// implementation for debug -//////////////////////////////////////////////////////////////////////////////// - -#if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING -#include "common/support.h" - -template -NOINLINE static void log(const char *fmt, Arguments... parameters) { - printf(fmt, (int)GetBlockIdInKernel(), - (int)__kmpc_get_hardware_thread_id_in_block(), (int)GetWarpId(), - (int)GetLaneId(), parameters...); -} - -#endif -#if OMPTARGET_NVPTX_TEST - -template -NOINLINE static void check(bool cond, const char *fmt, - Arguments... parameters) { - if (!cond) { - printf(fmt, (int)GetBlockIdInKernel(), - (int)__kmpc_get_hardware_thread_id_in_block(), (int)GetWarpId(), - (int)GetLaneId(), parameters...); - __builtin_trap(); - } -} - -NOINLINE static void check(bool cond) { - if (!cond) - __builtin_trap(); -} -#endif - -// set flags that are tested (inclusion properties) - -#define LD_ALL (LD_SET_ALL) - -#define LD_LOOP (LD_SET_LOOP | LD_SET_LOOPD) -#define LD_LOOPD (LD_SET_LOOPD) -#define LD_PAR (LD_SET_PAR | LD_SET_PARD) -#define LD_PARD (LD_SET_PARD) - -// pos 2 -#define LD_SYNC (LD_SET_SYNC | LD_SET_SYNCD) -#define LD_SYNCD (LD_SET_SYNCD) -#define LD_WAIT (LD_SET_WAIT) -#define LD_TASK (LD_SET_TASK) - -// pos 3 -#define LD_IO (LD_SET_IO | LD_SET_IOD) -#define LD_IOD (LD_SET_IOD) -#define LD_ENV (LD_SET_ENV) -#define LD_CANCEL (LD_SET_CANCEL) - -// pos 3 -#define LD_MEM (LD_SET_MEM) - -// implement -#if OMPTARGET_NVPTX_DEBUG - -#define DON(_flag) ((unsigned)(OMPTARGET_NVPTX_DEBUG) & (_flag)) - -#define PRINT0(_flag, _str) \ - { \ - if (omptarget_device_environment.debug_level && DON(_flag)) { \ - log(": " _str); \ - } \ - } - -#define PRINT(_flag, _str, _args...) 
\ - { \ - if (omptarget_device_environment.debug_level && DON(_flag)) { \ - log(": " _str, _args); \ - } \ - } -#else - -#define DON(_flag) (0) -#define PRINT0(flag, str) -#define PRINT(flag, str, _args...) - -#endif - -// for printing without worrying about precision, pointers... -#define P64(_x) ((unsigned long long)(_x)) - -//////////////////////////////////////////////////////////////////////////////// -// early defs for test -//////////////////////////////////////////////////////////////////////////////// - -#define LT_SAFETY (LT_SET_SAFETY | LT_SET_INPUT | LT_SET_FUSSY) -#define LT_INPUT (LT_SET_INPUT | LT_SET_FUSSY) -#define LT_FUSSY (LT_SET_FUSSY) - -#if OMPTARGET_NVPTX_TEST == LT_SET_SAFETY - -#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag)) -#define ASSERT0(_flag, _cond, _str) \ - { \ - if (TON(_flag)) { \ - check(_cond); \ - } \ - } -#define ASSERT(_flag, _cond, _str, _args...) \ - { \ - if (TON(_flag)) { \ - check(_cond); \ - } \ - } - -#elif OMPTARGET_NVPTX_TEST >= LT_SET_INPUT - -#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag)) -#define ASSERT0(_flag, _cond, _str) \ - { \ - if (TON(_flag)) { \ - check((_cond), " ASSERT: " _str "\n"); \ - } \ - } -#define ASSERT(_flag, _cond, _str, _args...) \ - { \ - if (TON(_flag)) { \ - check((_cond), " ASSERT: " _str "\n", \ - _args); \ - } \ - } - -#else - -#define TON(_flag) (0) -#define ASSERT0(_flag, _cond, _str) -#define ASSERT(_flag, _cond, _str, _args...) - -#endif - -//////////////////////////////////////////////////////////////////////////////// -// early defs for warning - -#define LW_ALL (LW_SET_ALL) -#define LW_ENV (LW_SET_FUSSY | LW_SET_INPUT | LW_SET_ENV) -#define LW_INPUT (LW_SET_FUSSY | LW_SET_INPUT) -#define LW_FUSSY (LW_SET_FUSSY) - -#if OMPTARGET_NVPTX_WARNING - -#define WON(_flag) ((OMPTARGET_NVPTX_WARNING) & (_flag)) -#define WARNING0(_flag, _str) \ - { \ - if (WON(_flag)) { \ - log(" WARNING: " _str); \ - } \ - } -#define WARNING(_flag, _str, _args...) 
\ - { \ - if (WON(_flag)) { \ - log(" WARNING: " _str, _args); \ - } \ - } - -#else - -#define WON(_flag) (0) -#define WARNING0(_flag, _str) -#define WARNING(_flag, _str, _args...) - -#endif - -#endif diff --git a/openmp/libomptarget/deviceRTLs/common/generated_microtask_cases.gen b/openmp/libomptarget/deviceRTLs/common/generated_microtask_cases.gen deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/generated_microtask_cases.gen +++ /dev/null @@ -1,405 +0,0 @@ -case 0: -((void (*)(kmp_int32 *, kmp_int32 * -))fn)(&global_tid, &bound_tid -); -break; -case 1: -((void (*)(kmp_int32 *, kmp_int32 * -, void *))fn)(&global_tid, &bound_tid -, args[0]); -break; -case 2: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *))fn)(&global_tid, &bound_tid -, args[0], args[1]); -break; -case 3: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2]); -break; -case 4: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -); -break; -case 5: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4]); -break; -case 6: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5]); -break; -case 7: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6]); -break; -case 8: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -); -break; -case 9: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * 
-, void *, void *, void *, void * -, void *))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -, args[8]); -break; -case 10: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -, args[8], args[9]); -break; -case 11: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -, args[8], args[9], args[10]); -break; -case 12: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -, args[8], args[9], args[10], args[11] -); -break; -case 13: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -, args[8], args[9], args[10], args[11] -, args[12]); -break; -case 14: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -, args[8], args[9], args[10], args[11] -, args[12], args[13]); -break; -case 15: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -, args[8], 
args[9], args[10], args[11] -, args[12], args[13], args[14]); -break; -case 16: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -, args[8], args[9], args[10], args[11] -, args[12], args[13], args[14], args[15] -); -break; -case 17: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -, args[8], args[9], args[10], args[11] -, args[12], args[13], args[14], args[15] -, args[16]); -break; -case 18: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -, args[8], args[9], args[10], args[11] -, args[12], args[13], args[14], args[15] -, args[16], args[17]); -break; -case 19: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -, args[8], args[9], args[10], args[11] -, args[12], args[13], args[14], args[15] -, args[16], args[17], args[18]); -break; -case 20: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] 
-, args[4], args[5], args[6], args[7] -, args[8], args[9], args[10], args[11] -, args[12], args[13], args[14], args[15] -, args[16], args[17], args[18], args[19] -); -break; -case 21: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -, args[8], args[9], args[10], args[11] -, args[12], args[13], args[14], args[15] -, args[16], args[17], args[18], args[19] -, args[20]); -break; -case 22: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -, args[8], args[9], args[10], args[11] -, args[12], args[13], args[14], args[15] -, args[16], args[17], args[18], args[19] -, args[20], args[21]); -break; -case 23: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -, args[8], args[9], args[10], args[11] -, args[12], args[13], args[14], args[15] -, args[16], args[17], args[18], args[19] -, args[20], args[21], args[22]); -break; -case 24: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], 
args[5], args[6], args[7] -, args[8], args[9], args[10], args[11] -, args[12], args[13], args[14], args[15] -, args[16], args[17], args[18], args[19] -, args[20], args[21], args[22], args[23] -); -break; -case 25: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -, args[8], args[9], args[10], args[11] -, args[12], args[13], args[14], args[15] -, args[16], args[17], args[18], args[19] -, args[20], args[21], args[22], args[23] -, args[24]); -break; -case 26: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -, args[8], args[9], args[10], args[11] -, args[12], args[13], args[14], args[15] -, args[16], args[17], args[18], args[19] -, args[20], args[21], args[22], args[23] -, args[24], args[25]); -break; -case 27: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -, args[8], args[9], args[10], args[11] -, args[12], args[13], args[14], args[15] -, args[16], args[17], args[18], args[19] -, args[20], args[21], args[22], args[23] -, args[24], args[25], args[26]); -break; -case 28: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void 
* -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -, args[8], args[9], args[10], args[11] -, args[12], args[13], args[14], args[15] -, args[16], args[17], args[18], args[19] -, args[20], args[21], args[22], args[23] -, args[24], args[25], args[26], args[27] -); -break; -case 29: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -, args[8], args[9], args[10], args[11] -, args[12], args[13], args[14], args[15] -, args[16], args[17], args[18], args[19] -, args[20], args[21], args[22], args[23] -, args[24], args[25], args[26], args[27] -, args[28]); -break; -case 30: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -, args[8], args[9], args[10], args[11] -, args[12], args[13], args[14], args[15] -, args[16], args[17], args[18], args[19] -, args[20], args[21], args[22], args[23] -, args[24], args[25], args[26], args[27] -, args[28], args[29]); -break; -case 31: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, 
void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -, args[8], args[9], args[10], args[11] -, args[12], args[13], args[14], args[15] -, args[16], args[17], args[18], args[19] -, args[20], args[21], args[22], args[23] -, args[24], args[25], args[26], args[27] -, args[28], args[29], args[30]); -break; -case 32: -((void (*)(kmp_int32 *, kmp_int32 * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -, void *, void *, void *, void * -))fn)(&global_tid, &bound_tid -, args[0], args[1], args[2], args[3] -, args[4], args[5], args[6], args[7] -, args[8], args[9], args[10], args[11] -, args[12], args[13], args[14], args[15] -, args[16], args[17], args[18], args[19] -, args[20], args[21], args[22], args[23] -, args[24], args[25], args[26], args[27] -, args[28], args[29], args[30], args[31] -); -break; \ No newline at end of file diff --git a/openmp/libomptarget/deviceRTLs/common/include/target.h b/openmp/libomptarget/deviceRTLs/common/include/target.h deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/include/target.h +++ /dev/null @@ -1,94 +0,0 @@ -//===-- target.h ---------- OpenMP device runtime target implementation ---===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Target region interfaces are simple interfaces designed to allow middle-end -// (=LLVM) passes to analyze and transform the code. 
To achieve good performance -// it may be required to run the associated passes. However, implementations of -// this interface shall always provide a correct implementation as close to the -// user expected code as possible. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_OPENMP_LIBOMPTARGET_DEVICERTLS_COMMON_TARGET_H -#define LLVM_OPENMP_LIBOMPTARGET_DEVICERTLS_COMMON_TARGET_H - -#include - -extern "C" { - -/// Forward declaration of the source location identifier "ident". -typedef struct ident ident_t; - -/// The target region _kernel_ interface for GPUs -/// -/// This deliberatly simple interface provides the middle-end (=LLVM) with -/// easier means to reason about the semantic of the code and transform it as -/// well. The runtime calls are therefore also desiged to carry sufficient -/// information necessary for optimizations. -/// -/// -/// Intended usage: -/// -/// \code -/// void kernel(...) { -/// ThreadKind = __kmpc_target_init(Ident, /* Mode */ 1, -/// /* UseGenericStateMachine */ true, -/// /* RequiresFullRuntime */ ... ); -/// if (ThreadKind == -1) { -/// // User defined kernel code. -/// } -/// __kmpc_target_deinit(...); -/// } -/// \endcode -/// -/// Which can be transformed to: -/// -/// \code -/// void kernel(...) { -/// ThreadKind = __kmpc_target_init(Ident, /* Mode */ 1, -/// /* UseGenericStateMachine */ false, -/// /* RequiresFullRuntime */ ... ); -/// if (ThreadKind == -1) { -/// // User defined kernel code. -/// } else { -/// assume(ThreadKind == ThreadId); -/// // Custom, kernel-specific state machine code. -/// } -/// __kmpc_target_deinit(...); -/// } -/// \endcode -/// -/// -///{ - -/// Initialization -/// -/// Must be called by all threads. -/// -/// \param Ident Source location identification, can be NULL. 
-/// -int32_t __kmpc_target_init(ident_t *Ident, int8_t Mode, - bool UseGenericStateMachine, - bool RequiresFullRuntime); - -/// De-Initialization -/// -/// Must be called by the main thread in generic mode, can be called by all -/// threads. Must be called by all threads in SPMD mode. -/// -/// In non-SPMD, this function releases the workers trapped in a state machine -/// and also any memory dynamically allocated by the runtime. -/// -/// \param Ident Source location identification, can be NULL. -/// -void __kmpc_target_deinit(ident_t *Ident, int8_t Mode, - bool RequiresFullRuntime); - -///} -} -#endif diff --git a/openmp/libomptarget/deviceRTLs/common/include/target/shuffle.h b/openmp/libomptarget/deviceRTLs/common/include/target/shuffle.h deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/include/target/shuffle.h +++ /dev/null @@ -1,102 +0,0 @@ -//===- shuffle.h - OpenMP variants of the shuffle idiom for all targets -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Shuffle function implementations for all supported targets. -// -// Note: We unify the mask type to uint64_t instead of __kmpc_impl_lanemask_t. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LIBOMPTARGET_DEVICERTL_SHUFFLE_H -#define LIBOMPTARGET_DEVICERTL_SHUFFLE_H - -#include - -#pragma omp declare target - -/// External shuffle API -/// -///{ - -extern "C" { -int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size); -int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size); -} - -///} - -/// Forward declarations -/// -///{ -extern "C" { -unsigned GetLaneId(); -unsigned __kmpc_get_warp_size(); -void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi); -uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi); -} -///} - -/// Fallback implementations of the shuffle sync idiom. -/// Unavailable at present (would error at link time if used). -/// -///{ - -int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var, int32_t SrcLane); - -int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var, uint32_t Delta, - int32_t Width); - -///} - -/// AMDGCN implementations of the shuffle sync idiom. -/// -///{ -#pragma omp begin declare variant match(device = {arch(amdgcn)}) - -inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var, - int32_t SrcLane) { - int Width = __kmpc_get_warp_size(); - int Self = GetLaneId(); - int Index = SrcLane + (Self & ~(Width - 1)); - return __builtin_amdgcn_ds_bpermute(Index << 2, Var); -} - -inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var, - uint32_t LaneDelta, int32_t Width) { - int Self = GetLaneId(); - int Index = Self + LaneDelta; - Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index; - return __builtin_amdgcn_ds_bpermute(Index << 2, Var); -} - -#pragma omp end declare variant -///} - -/// NVPTX implementations of the shuffle and shuffle sync idiom. 
-/// -///{ -#pragma omp begin declare variant match( \ - device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) - -inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var, - int32_t SrcLane) { - return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f); -} - -inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var, - uint32_t Delta, int32_t Width) { - int32_t T = ((__kmpc_get_warp_size() - Width) << 8) | 0x1f; - return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T); -} - -#pragma omp end declare variant -///} - -#pragma omp end declare target - -#endif diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/omptarget.h +++ /dev/null @@ -1,282 +0,0 @@ -//===---- omptarget.h - OpenMP GPU initialization ---------------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the declarations of all library macros, types, -// and functions. 
-// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_H -#define OMPTARGET_H - -#include "common/allocator.h" -#include "common/debug.h" // debug -#include "common/state-queue.h" -#include "common/support.h" -#include "interface.h" // interfaces with omp, compiler, and user -#include "target_impl.h" - -#define OMPTARGET_NVPTX_VERSION 1.1 - -// used by the library for the interface with the app -#define DISPATCH_FINISHED 0 -#define DISPATCH_NOTFINISHED 1 - -// used by dynamic scheduling -#define FINISHED 0 -#define NOT_FINISHED 1 -#define LAST_CHUNK 2 - -#define BARRIER_COUNTER 0 -#define ORDERED_COUNTER 1 - -// Worker slot type which is initialized with the default worker slot -// size of 4*32 bytes. -struct __kmpc_data_sharing_slot { - __kmpc_data_sharing_slot *Next; - __kmpc_data_sharing_slot *Prev; - void *PrevSlotStackPtr; - void *DataEnd; - char Data[DS_Worker_Warp_Slot_Size]; -}; - -//////////////////////////////////////////////////////////////////////////////// -// task ICV and (implicit & explicit) task state - -class omptarget_nvptx_TaskDescr { -public: - // methods for flags - INLINE omp_sched_t GetRuntimeSched() const; - INLINE void SetRuntimeSched(omp_sched_t sched); - INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; } - INLINE int InL2OrHigherParallelRegion() const { - return items.flags & TaskDescr_InParL2P; - } - INLINE int IsParallelConstruct() const { - return items.flags & TaskDescr_IsParConstr; - } - INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); } - // methods for other fields - INLINE uint16_t &ThreadId() { return items.threadId; } - INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; } - INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; } - INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) { - prev = taskDescr; - } - // init & copy - INLINE void InitLevelZeroTaskDescr(); - INLINE void 
InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr); - INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr); - INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr); - INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr); - INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr); - INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr); - INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); - INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr, - uint16_t tid, uint16_t tnum); - INLINE void SaveLoopData(); - INLINE void RestoreLoopData() const; - -private: - // bits for flags: (6 used, 2 free) - // 3 bits (SchedMask) for runtime schedule - // 1 bit (InPar) if this thread has encountered one or more parallel region - // 1 bit (IsParConstr) if ICV for a parallel region (false = explicit task) - // 1 bit (InParL2+) if this thread has encountered L2 or higher parallel - // region - static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4); - static const uint8_t TaskDescr_InPar = 0x10; - static const uint8_t TaskDescr_IsParConstr = 0x20; - static const uint8_t TaskDescr_InParL2P = 0x40; - - struct SavedLoopDescr_items { - int64_t loopUpperBound; - int64_t nextLowerBound; - int64_t chunk; - int64_t stride; - kmp_sched_t schedule; - } loopData; - - struct TaskDescr_items { - uint8_t flags; // 6 bit used (see flag above) - uint8_t unused; - uint16_t threadId; // thread id - uint64_t runtimeChunkSize; // runtime chunk size - } items; - omptarget_nvptx_TaskDescr *prev; -}; - -// build on kmp -typedef struct omptarget_nvptx_ExplicitTaskDescr { - omptarget_nvptx_TaskDescr - taskDescr; // omptarget_nvptx task description (must be first) - kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last) -} omptarget_nvptx_ExplicitTaskDescr; - -//////////////////////////////////////////////////////////////////////////////// -// Descriptor of a 
parallel region (worksharing in general) - -class omptarget_nvptx_WorkDescr { - -public: - // access to data - INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; } - -private: - omptarget_nvptx_TaskDescr masterTaskICV; -}; - -//////////////////////////////////////////////////////////////////////////////// - -class omptarget_nvptx_TeamDescr { -public: - // access to data - INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() { - return &levelZeroTaskDescr; - } - INLINE omptarget_nvptx_WorkDescr &WorkDescr() { - return workDescrForActiveParallel; - } - - // init - INLINE void InitTeamDescr(); - - INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) { - worker_rootS[wid].DataEnd = - &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size; - // We currently do not have a next slot. - worker_rootS[wid].Next = 0; - worker_rootS[wid].Prev = 0; - worker_rootS[wid].PrevSlotStackPtr = 0; - return (__kmpc_data_sharing_slot *)&worker_rootS[wid]; - } - -private: - omptarget_nvptx_TaskDescr - levelZeroTaskDescr; // icv for team master initial thread - omptarget_nvptx_WorkDescr - workDescrForActiveParallel; // one, ONLY for the active par - - ALIGN(16) - __kmpc_data_sharing_slot worker_rootS[DS_Max_Warp_Number]; -}; - -//////////////////////////////////////////////////////////////////////////////// -// thread private data (struct of arrays for better coalescing) -// tid refers here to the global thread id -// do not support multiple concurrent kernel a this time -class omptarget_nvptx_ThreadPrivateContext { -public: - // task - INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) { - return &levelOneTaskDescr[tid]; - } - INLINE void SetTopLevelTaskDescr(int tid, - omptarget_nvptx_TaskDescr *taskICV) { - topTaskDescr[tid] = taskICV; - } - INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const; - // schedule (for dispatch) - INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; } - INLINE int64_t &Chunk(int tid) 
{ return chunk[tid]; } - INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; } - INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; } - INLINE int64_t &Stride(int tid) { return stride[tid]; } - - INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; } - - INLINE void InitThreadPrivateContext(int tid); - INLINE uint64_t &Cnt() { return cnt; } - -private: - // team context for this team - omptarget_nvptx_TeamDescr teamContext; - // task ICV for implicit threads in the only parallel region - omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM]; - // pointer where to find the current task ICV (top of the stack) - omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM]; - // schedule (for dispatch) - kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for - int64_t chunk[MAX_THREADS_PER_TEAM]; - int64_t loopUpperBound[MAX_THREADS_PER_TEAM]; - // state for dispatch with dyn/guided OR static (never use both at a time) - int64_t nextLowerBound[MAX_THREADS_PER_TEAM]; - int64_t stride[MAX_THREADS_PER_TEAM]; - uint64_t cnt; -}; - -/// Memory manager for statically allocated memory. 
-class omptarget_nvptx_SimpleMemoryManager { -private: - struct MemDataTy { - volatile unsigned keys[OMP_STATE_COUNT]; - } MemData[MAX_SM] ALIGN(128); - - INLINE static uint32_t hash(unsigned key) { - return key & (OMP_STATE_COUNT - 1); - } - -public: - INLINE void Release(); - INLINE const void *Acquire(const void *buf, size_t size); -}; - -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// global data tables -//////////////////////////////////////////////////////////////////////////////// - -extern omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager; -extern uint32_t EXTERN_SHARED(usedMemIdx); -extern uint32_t EXTERN_SHARED(usedSlotIdx); -#if _OPENMP -extern uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; -#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc) -#else -extern uint8_t EXTERN_SHARED(parallelLevel)[MAX_THREADS_PER_TEAM / WARPSIZE]; -#endif -extern uint16_t EXTERN_SHARED(threadLimit); -extern uint16_t EXTERN_SHARED(threadsInTeam); -extern uint16_t EXTERN_SHARED(nThreads); -extern omptarget_nvptx_ThreadPrivateContext * - EXTERN_SHARED(omptarget_nvptx_threadPrivateContext); - -extern int8_t EXTERN_SHARED(execution_param); -extern void *EXTERN_SHARED(ReductionScratchpadPtr); - -//////////////////////////////////////////////////////////////////////////////// -// work function (outlined parallel/simd functions) and arguments. -// needed for L1 parallelism only. 
-//////////////////////////////////////////////////////////////////////////////// - -typedef void *omptarget_nvptx_WorkFn; -extern omptarget_nvptx_WorkFn EXTERN_SHARED(omptarget_nvptx_workFn); - -//////////////////////////////////////////////////////////////////////////////// -// get private data structures -//////////////////////////////////////////////////////////////////////////////// - -INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor(); -INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor(); -INLINE omptarget_nvptx_TaskDescr * -getMyTopTaskDescriptor(bool isSPMDExecutionMode); -INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId); - -//////////////////////////////////////////////////////////////////////////////// -// inlined implementation -//////////////////////////////////////////////////////////////////////////////// - -INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); } -INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __builtin_popcount(x); } -INLINE uint32_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); } -INLINE uint32_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); } - -#include "common/omptargeti.h" - -#endif diff --git a/openmp/libomptarget/deviceRTLs/common/omptargeti.h b/openmp/libomptarget/deviceRTLs/common/omptargeti.h deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/omptargeti.h +++ /dev/null @@ -1,223 +0,0 @@ -//===---- omptargeti.h - OpenMP GPU initialization --------------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the declarations of all library macros, types, -// and functions. 
-// -//===----------------------------------------------------------------------===// - -//////////////////////////////////////////////////////////////////////////////// -// Task Descriptor -//////////////////////////////////////////////////////////////////////////////// - -INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() const { - // sched starts from 1..4; encode it as 0..3; so add 1 here - uint8_t rc = (items.flags & TaskDescr_SchedMask) + 1; - return (omp_sched_t)rc; -} - -INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) { - // sched starts from 1..4; encode it as 0..3; so sub 1 here - uint8_t val = ((uint8_t)sched) - 1; - // clear current sched - items.flags &= ~TaskDescr_SchedMask; - // set new sched - items.flags |= val; -} - -INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() { - // slow method - // flag: - // default sched is static, - // dyn is off (unused now anyway, but may need to sample from host ?) - // not in parallel - - items.flags = 0; - items.threadId = 0; // is master - items.runtimeChunkSize = 1; // preferred chunking statik with chunk 1 -} - -// This is called when all threads are started together in SPMD mode. -// OMP directives include target parallel, target distribute parallel for, etc. -INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr( - omptarget_nvptx_TaskDescr *parentTaskDescr) { - // slow method - // flag: - // default sched is static, - // dyn is off (unused now anyway, but may need to sample from host ?) 
- // in L1 parallel - - items.flags = TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel - items.threadId = - __kmpc_get_hardware_thread_id_in_block(); // get ids from cuda (only - // called for 1st level) - items.runtimeChunkSize = 1; // preferred chunking statik with chunk 1 - prev = parentTaskDescr; -} - -INLINE void omptarget_nvptx_TaskDescr::CopyData( - omptarget_nvptx_TaskDescr *sourceTaskDescr) { - items = sourceTaskDescr->items; -} - -INLINE void -omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) { - CopyData(sourceTaskDescr); - prev = sourceTaskDescr->prev; -} - -INLINE void omptarget_nvptx_TaskDescr::CopyParent( - omptarget_nvptx_TaskDescr *parentTaskDescr) { - CopyData(parentTaskDescr); - prev = parentTaskDescr; -} - -INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask( - omptarget_nvptx_TaskDescr *parentTaskDescr) { - CopyParent(parentTaskDescr); - items.flags = items.flags & ~TaskDescr_IsParConstr; - ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task"); -} - -INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr( - omptarget_nvptx_TaskDescr *masterTaskDescr) { - CopyParent(masterTaskDescr); - // overwrite specific items; - items.flags |= - TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel -} - -INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr( - omptarget_nvptx_TaskDescr *workTaskDescr) { - Copy(workTaskDescr); - // - // overwrite specific items; - // - // The threadID should be __kmpc_get_hardware_thread_id_in_block() % - // GetMasterThreadID(). This is so that the serial master (first lane in the - // master warp) gets a threadId of 0. However, we know that this function is - // always called in a parallel region where only workers are active. The - // serial master thread never enters this region. When a parallel region is - // executed serially, the threadId is set to 0 elsewhere and the - // kmpc_serialized_* functions are called, which never activate this region. 
- items.threadId = - __kmpc_get_hardware_thread_id_in_block(); // get ids from cuda (only - // called for 1st level) -} - -INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent( - omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) { - CopyParent(parentTaskDescr); - items.flags |= TaskDescr_InParL2P; // In L2+ parallelism - items.threadId = tid; -} - -INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() { - loopData.loopUpperBound = - omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId); - loopData.nextLowerBound = - omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId); - loopData.schedule = - omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId); - loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId); - loopData.stride = - omptarget_nvptx_threadPrivateContext->Stride(items.threadId); -} - -INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const { - omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk; - omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) = - loopData.loopUpperBound; - omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) = - loopData.nextLowerBound; - omptarget_nvptx_threadPrivateContext->Stride(items.threadId) = - loopData.stride; - omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) = - loopData.schedule; -} - -//////////////////////////////////////////////////////////////////////////////// -// Thread Private Context -//////////////////////////////////////////////////////////////////////////////// - -INLINE omptarget_nvptx_TaskDescr * -omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) const { - ASSERT0( - LT_FUSSY, tid < MAX_THREADS_PER_TEAM, - "Getting top level, tid is larger than allocated data structure size"); - return topTaskDescr[tid]; -} - -INLINE void -omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) { - // levelOneTaskDescr 
is init when starting the parallel region - // top task descr is NULL (team master version will be fixed separately) - topTaskDescr[tid] = NULL; - // the following don't need to be init here; they are init when using dyn - // sched - // current_Event, events_Number, chunk, num_Iterations, schedule -} - -//////////////////////////////////////////////////////////////////////////////// -// Team Descriptor -//////////////////////////////////////////////////////////////////////////////// - -INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() { - levelZeroTaskDescr.InitLevelZeroTaskDescr(); -} - -//////////////////////////////////////////////////////////////////////////////// -// Get private data structure for thread -//////////////////////////////////////////////////////////////////////////////// - -// Utility routines for CUDA threads -INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() { - return omptarget_nvptx_threadPrivateContext->TeamContext(); -} - -INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() { - omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); - return currTeamDescr.WorkDescr(); -} - -INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) { - return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); -} - -INLINE omptarget_nvptx_TaskDescr * -getMyTopTaskDescriptor(bool isSPMDExecutionMode) { - return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock()); -} - -//////////////////////////////////////////////////////////////////////////////// -// Memory management runtime functions. 
-//////////////////////////////////////////////////////////////////////////////// - -INLINE void omptarget_nvptx_SimpleMemoryManager::Release() { - ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM, - "SlotIdx is too big or uninitialized."); - ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT, - "MemIdx is too big or uninitialized."); - MemDataTy &MD = MemData[usedSlotIdx]; - __kmpc_atomic_exchange((unsigned *)&MD.keys[usedMemIdx], 0u); -} - -INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf, - size_t size) { - ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM, - "SlotIdx is too big or uninitialized."); - const unsigned sm = usedSlotIdx; - MemDataTy &MD = MemData[sm]; - unsigned i = hash(GetBlockIdInKernel()); - while (__kmpc_atomic_cas((unsigned *)&MD.keys[i], 0u, 1u) != 0) { - i = hash(i + 1); - } - usedSlotIdx = sm; - usedMemIdx = i; - return static_cast(buf) + (sm * OMP_STATE_COUNT + i) * size; -} diff --git a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu b/openmp/libomptarget/deviceRTLs/common/src/cancel.cu deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu +++ /dev/null @@ -1,31 +0,0 @@ -//===------ cancel.cu - NVPTX OpenMP cancel interface ------------ CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Interface to be used in the implementation of OpenMP cancel. 
-// -//===----------------------------------------------------------------------===// -#pragma omp declare target - -#include "common/debug.h" -#include "interface.h" - -EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid, - int32_t cancelVal) { - PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", (int)cancelVal); - // disabled - return 0; -} - -EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid, - int32_t cancelVal) { - PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", (int)cancelVal); - // disabled - return 0; -} - -#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/critical.cu b/openmp/libomptarget/deviceRTLs/common/src/critical.cu deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/critical.cu +++ /dev/null @@ -1,31 +0,0 @@ -//===------ critical.cu - NVPTX OpenMP critical ------------------ CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of critical with KMPC interface -// -//===----------------------------------------------------------------------===// -#pragma omp declare target - -#include "common/debug.h" -#include "interface.h" - -EXTERN -void __kmpc_critical(kmp_Ident *loc, int32_t global_tid, - kmp_CriticalName *lck) { - PRINT0(LD_IO, "call to kmpc_critical()\n"); - omp_set_lock((omp_lock_t *)lck); -} - -EXTERN -void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid, - kmp_CriticalName *lck) { - PRINT0(LD_IO, "call to kmpc_end_critical()\n"); - omp_unset_lock((omp_lock_t *)lck); -} - -#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu +++ /dev/null @@ -1,194 +0,0 @@ -//===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of data sharing environments -// -//===----------------------------------------------------------------------===// -#pragma omp declare target - -#include "common/omptarget.h" -#include "target/shuffle.h" -#include "target_impl.h" - -//////////////////////////////////////////////////////////////////////////////// -// Runtime functions for trunk data sharing scheme. 
-//////////////////////////////////////////////////////////////////////////////// - -static constexpr unsigned MinBytes = 8; - -static constexpr unsigned Alignment = 8; - -/// External symbol to access dynamic shared memory. -extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment))); -#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc) - -EXTERN void *__kmpc_get_dynamic_shared() { return DynamicSharedBuffer; } - -EXTERN void *llvm_omp_get_dynamic_shared() { - return __kmpc_get_dynamic_shared(); -} - -template -struct alignas(32) ThreadStackTy { - static constexpr unsigned BytesPerThread = BPerThread; - static constexpr unsigned NumThreads = NThreads; - static constexpr unsigned NumWarps = (NThreads + WARPSIZE - 1) / WARPSIZE; - - unsigned char Data[NumThreads][BytesPerThread]; - unsigned char Usage[NumThreads]; -}; - -[[clang::loader_uninitialized]] ThreadStackTy MainSharedStack; -#pragma omp allocate(MainSharedStack) allocator(omp_pteam_mem_alloc) - -[[clang::loader_uninitialized]] ThreadStackTy - WorkerSharedStack; -#pragma omp allocate(WorkerSharedStack) allocator(omp_pteam_mem_alloc) - -EXTERN void *__kmpc_alloc_shared(size_t Bytes) { - size_t AlignedBytes = Bytes + (Bytes % MinBytes); - int TID = __kmpc_get_hardware_thread_id_in_block(); - if (__kmpc_is_generic_main_thread(TID)) { - // Main thread alone, use shared memory if space is available. 
- if (MainSharedStack.Usage[0] + AlignedBytes <= MainSharedStack.BytesPerThread) { - void *Ptr = &MainSharedStack.Data[0][MainSharedStack.Usage[0]]; - MainSharedStack.Usage[0] += AlignedBytes; - return Ptr; - } - } else if (TID < WorkerSharedStack.NumThreads) { - if (WorkerSharedStack.Usage[TID] + AlignedBytes <= WorkerSharedStack.BytesPerThread) { - void *Ptr = &WorkerSharedStack.Data[TID][WorkerSharedStack.Usage[TID]]; - WorkerSharedStack.Usage[TID] += AlignedBytes; - return Ptr; - } - } - // Fallback to malloc - return SafeMalloc(Bytes, "AllocGlobalFallback"); -} - -EXTERN void __kmpc_free_shared(void *Ptr, size_t Bytes) { - size_t AlignedBytes = Bytes + (Bytes % MinBytes); - int TID = __kmpc_get_hardware_thread_id_in_block(); - if (__kmpc_is_generic_main_thread(TID)) { - if (Ptr >= &MainSharedStack.Data[0][0] && - Ptr < &MainSharedStack.Data[MainSharedStack.NumThreads][0]) { - MainSharedStack.Usage[0] -= AlignedBytes; - return; - } - } else if (TID < WorkerSharedStack.NumThreads) { - if (Ptr >= &WorkerSharedStack.Data[0][0] && - Ptr < &WorkerSharedStack.Data[WorkerSharedStack.NumThreads][0]) { - int TID = __kmpc_get_hardware_thread_id_in_block(); - WorkerSharedStack.Usage[TID] -= AlignedBytes; - return; - } - } - SafeFree(Ptr, "FreeGlobalFallback"); -} - -EXTERN void __kmpc_data_sharing_init_stack() { - for (unsigned i = 0; i < MainSharedStack.NumWarps; ++i) - MainSharedStack.Usage[i] = 0; - for (unsigned i = 0; i < WorkerSharedStack.NumThreads; ++i) - WorkerSharedStack.Usage[i] = 0; -} - -/// Allocate storage in shared memory to communicate arguments from the main -/// thread to the workers in generic mode. If we exceed -/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication. 
-#define NUM_SHARED_VARIABLES_IN_SHARED_MEM 64 - -[[clang::loader_uninitialized]] static void - *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM]; -#pragma omp allocate(SharedMemVariableSharingSpace) \ - allocator(omp_pteam_mem_alloc) -[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr; -#pragma omp allocate(SharedMemVariableSharingSpacePtr) \ - allocator(omp_pteam_mem_alloc) - -// Begin a data sharing context. Maintain a list of references to shared -// variables. This list of references to shared variables will be passed -// to one or more threads. -// In L0 data sharing this is called by master thread. -// In L1 data sharing this is called by active warp master thread. -EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) { - if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) { - SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0]; - } else { - SharedMemVariableSharingSpacePtr = - (void **)SafeMalloc(nArgs * sizeof(void *), "new extended args"); - } - *GlobalArgs = SharedMemVariableSharingSpacePtr; -} - -// End a data sharing context. There is no need to have a list of refs -// to shared variables because the context in which those variables were -// shared has now ended. This should clean-up the list of references only -// without affecting the actual global storage of the variables. -// In L0 data sharing this is called by master thread. -// In L1 data sharing this is called by active warp master thread. -EXTERN void __kmpc_end_sharing_variables() { - if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0]) - SafeFree(SharedMemVariableSharingSpacePtr, "new extended args"); -} - -// This function will return a list of references to global variables. This -// is how the workers will get a reference to the globalized variable. The -// members of this list will be passed to the outlined parallel function -// preserving the order. -// Called by all workers. 
-EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) { - *GlobalArgs = SharedMemVariableSharingSpacePtr; -} - -// This function is used to init static memory manager. This manager is used to -// manage statically allocated global memory. This memory is allocated by the -// compiler and used to correctly implement globalization of the variables in -// target, teams and distribute regions. -EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, - const void *buf, size_t size, - int16_t is_shared, - const void **frame) { - if (is_shared) { - *frame = buf; - return; - } - if (isSPMDExecutionMode) { - if (__kmpc_get_hardware_thread_id_in_block() == 0) { - *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); - } - __kmpc_impl_syncthreads(); - return; - } - ASSERT0(LT_FUSSY, - __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(), - "Must be called only in the target master thread."); - *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); - __kmpc_impl_threadfence(); -} - -EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, - int16_t is_shared) { - if (is_shared) - return; - if (isSPMDExecutionMode) { - __kmpc_impl_syncthreads(); - if (__kmpc_get_hardware_thread_id_in_block() == 0) { - omptarget_nvptx_simpleMemoryManager.Release(); - } - return; - } - __kmpc_impl_threadfence(); - ASSERT0(LT_FUSSY, - __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(), - "Must be called only in the target master thread."); - omptarget_nvptx_simpleMemoryManager.Release(); -} - -#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu +++ /dev/null @@ -1,359 +0,0 @@ -//===------------ libcall.cu - OpenMP GPU user calls ------------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements the OpenMP runtime functions that can be -// invoked by the user in an OpenMP region -// -//===----------------------------------------------------------------------===// -#pragma omp declare target - -#include "common/omptarget.h" -#include "target_impl.h" - -EXTERN double omp_get_wtick(void) { - double rc = __kmpc_impl_get_wtick(); - PRINT(LD_IO, "omp_get_wtick() returns %g\n", rc); - return rc; -} - -EXTERN double omp_get_wtime(void) { - double rc = __kmpc_impl_get_wtime(); - PRINT(LD_IO, "call omp_get_wtime() returns %g\n", rc); - return rc; -} - -EXTERN void omp_set_num_threads(int num) { - // Ignore it for SPMD mode. - if (__kmpc_is_spmd_exec_mode()) - return; - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); - PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num); - if (num <= 0) { - WARNING0(LW_INPUT, "expected positive num; ignore\n"); - } else if (parallelLevel[GetWarpId()] == 0) { - nThreads = num; - } -} - -EXTERN int omp_get_num_threads(void) { - int rc = GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode()); - PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc); - return rc; -} - -EXTERN int omp_get_max_threads(void) { - if (parallelLevel[GetWarpId()] > 0) - // We're already in parallel region. - return 1; // default is 1 thread avail - // Not currently in a parallel region, return what was set. 
- int rc = 1; - if (parallelLevel[GetWarpId()] == 0) - rc = nThreads; - ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads"); - PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc); - return rc; -} - -EXTERN int omp_get_thread_limit(void) { - if (__kmpc_is_spmd_exec_mode()) - return __kmpc_get_hardware_num_threads_in_block(); - int rc = threadLimit; - PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc); - return rc; -} - -EXTERN int omp_get_thread_num() { - int rc = GetOmpThreadId(); - PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc); - return rc; -} - -EXTERN int omp_get_num_procs(void) { - int rc = GetNumberOfProcsInDevice(__kmpc_is_spmd_exec_mode()); - PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", rc); - return rc; -} - -EXTERN int omp_in_parallel(void) { - int rc = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0; - PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc); - return rc; -} - -EXTERN int omp_in_final(void) { - // treat all tasks as final... Specs may expect runtime to keep - // track more precisely if a task was actively set by users... This - // is not explicitly specified; will treat as if runtime can - // actively decide to put a non-final task into a final one. 
- int rc = 1; - PRINT(LD_IO, "call omp_in_final() returns %d\n", rc); - return rc; -} - -EXTERN void omp_set_dynamic(int flag) { - PRINT(LD_IO, "call omp_set_dynamic(%d) is ignored (no support)\n", flag); -} - -EXTERN int omp_get_dynamic(void) { - int rc = 0; - PRINT(LD_IO, "call omp_get_dynamic() returns %d\n", rc); - return rc; -} - -EXTERN void omp_set_nested(int flag) { - PRINT(LD_IO, "call omp_set_nested(%d) is ignored (no nested support)\n", - flag); -} - -EXTERN int omp_get_nested(void) { - int rc = 0; - PRINT(LD_IO, "call omp_get_nested() returns %d\n", rc); - return rc; -} - -EXTERN void omp_set_max_active_levels(int level) { - PRINT(LD_IO, - "call omp_set_max_active_levels(%d) is ignored (no nested support)\n", - level); -} - -EXTERN int omp_get_max_active_levels(void) { - int rc = 1; - PRINT(LD_IO, "call omp_get_max_active_levels() returns %d\n", rc); - return rc; -} - -EXTERN int omp_get_level(void) { - int level = __kmpc_parallel_level(); - PRINT(LD_IO, "call omp_get_level() returns %d\n", level); - return level; -} - -EXTERN int omp_get_active_level(void) { - int level = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0; - PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level) - return level; -} - -EXTERN int omp_get_ancestor_thread_num(int level) { - if (__kmpc_is_spmd_exec_mode()) - return level == 1 ? __kmpc_get_hardware_thread_id_in_block() : 0; - int rc = -1; - // If level is 0 or all parallel regions are not active - return 0. 
- unsigned parLevel = parallelLevel[GetWarpId()]; - if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) { - int totLevel = omp_get_level(); - if (level <= totLevel) { - omptarget_nvptx_TaskDescr *currTaskDescr = - getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false); - int steps = totLevel - level; - PRINT(LD_IO, "backtrack %d steps\n", steps); - ASSERT0(LT_FUSSY, currTaskDescr, - "do not expect fct to be called in a non-active thread"); - do { - if (DON(LD_IOD)) { - // print current state - omp_sched_t sched = currTaskDescr->GetRuntimeSched(); - PRINT(LD_ALL, - "task descr %s %d: %s, in par %d, rt sched %d," - " chunk %" PRIu64 "; tid %d, tnum %d, nthreads %d\n", - "ancestor", steps, - (currTaskDescr->IsParallelConstruct() ? "par" : "task"), - (int)currTaskDescr->InParallelRegion(), (int)sched, - currTaskDescr->RuntimeChunkSize(), - (int)currTaskDescr->ThreadId(), (int)threadsInTeam, - (int)nThreads); - } - - if (currTaskDescr->IsParallelConstruct()) { - // found the level - if (!steps) { - rc = currTaskDescr->ThreadId(); - break; - } - steps--; - } - currTaskDescr = currTaskDescr->GetPrevTaskDescr(); - } while (currTaskDescr); - ASSERT0(LT_FUSSY, !steps, "expected to find all steps"); - } - } else if (level == 0 || - (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL && - level <= parLevel) || - (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL && - level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) { - rc = 0; - } - PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level, - rc) - return rc; -} - -EXTERN int omp_get_team_size(int level) { - if (__kmpc_is_spmd_exec_mode()) - return level == 1 ? __kmpc_get_hardware_num_threads_in_block() : 1; - int rc = -1; - unsigned parLevel = parallelLevel[GetWarpId()]; - // If level is 0 or all parallel regions are not active - return 1. 
- if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) { - rc = threadsInTeam; - } else if (level == 0 || - (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL && - level <= parLevel) || - (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL && - level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) { - rc = 1; - } - PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc) - return rc; -} - -EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) { - if (isRuntimeUninitialized()) { - ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), - "Expected SPMD mode only with uninitialized runtime."); - *kind = omp_sched_static; - *modifier = 1; - } else { - omptarget_nvptx_TaskDescr *currTaskDescr = - getMyTopTaskDescriptor(__kmpc_is_spmd_exec_mode()); - *kind = currTaskDescr->GetRuntimeSched(); - *modifier = currTaskDescr->RuntimeChunkSize(); - } - PRINT(LD_IO, "call omp_get_schedule returns sched %d and modif %d\n", - (int)*kind, *modifier); -} - -EXTERN void omp_set_schedule(omp_sched_t kind, int modifier) { - PRINT(LD_IO, "call omp_set_schedule(sched %d, modif %d)\n", (int)kind, - modifier); - if (isRuntimeUninitialized()) { - ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), - "Expected SPMD mode only with uninitialized runtime."); - return; - } - if (kind >= omp_sched_static && kind < omp_sched_auto) { - omptarget_nvptx_TaskDescr *currTaskDescr = - getMyTopTaskDescriptor(__kmpc_is_spmd_exec_mode()); - currTaskDescr->SetRuntimeSched(kind); - currTaskDescr->RuntimeChunkSize() = modifier; - PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %" PRIu64 "\n", - (int)currTaskDescr->GetRuntimeSched(), - currTaskDescr->RuntimeChunkSize()); - } -} - -EXTERN omp_proc_bind_t omp_get_proc_bind(void) { - PRINT0(LD_IO, "call omp_get_proc_bin() is true, regardless on state\n"); - return omp_proc_bind_true; -} - -EXTERN int omp_get_num_places(void) { - PRINT0(LD_IO, "call omp_get_num_places() returns 0\n"); - return 0; -} - -EXTERN int omp_get_place_num_procs(int 
place_num) { - PRINT0(LD_IO, "call omp_get_place_num_procs() returns 0\n"); - return 0; -} - -EXTERN void omp_get_place_proc_ids(int place_num, int *ids) { - PRINT0(LD_IO, "call to omp_get_place_proc_ids()\n"); -} - -EXTERN int omp_get_place_num(void) { - PRINT0(LD_IO, "call to omp_get_place_num() returns 0\n"); - return 0; -} - -EXTERN int omp_get_partition_num_places(void) { - PRINT0(LD_IO, "call to omp_get_partition_num_places() returns 0\n"); - return 0; -} - -EXTERN void omp_get_partition_place_nums(int *place_nums) { - PRINT0(LD_IO, "call to omp_get_partition_place_nums()\n"); -} - -EXTERN int omp_get_cancellation(void) { - int rc = 0; - PRINT(LD_IO, "call omp_get_cancellation() returns %d\n", rc); - return rc; -} - -EXTERN void omp_set_default_device(int deviceId) { - PRINT0(LD_IO, "call omp_get_default_device() is undef on device\n"); -} - -EXTERN int omp_get_default_device(void) { - PRINT0(LD_IO, - "call omp_get_default_device() is undef on device, returns 0\n"); - return 0; -} - -EXTERN int omp_get_num_devices(void) { - PRINT0(LD_IO, "call omp_get_num_devices() is undef on device, returns 0\n"); - return 0; -} - -EXTERN int omp_get_num_teams(void) { - int rc = GetNumberOfOmpTeams(); - PRINT(LD_IO, "call omp_get_num_teams() returns %d\n", rc); - return rc; -} - -EXTERN int omp_get_team_num() { - int rc = GetOmpTeamId(); - PRINT(LD_IO, "call omp_get_team_num() returns %d\n", rc); - return rc; -} - -// Unspecified on the device. -EXTERN int omp_get_initial_device(void) { - PRINT0(LD_IO, "call omp_get_initial_device() returns 0\n"); - return 0; -} - -// Unused for now. 
-EXTERN int omp_get_max_task_priority(void) { - PRINT0(LD_IO, "call omp_get_max_task_priority() returns 0\n"); - return 0; -} - -//////////////////////////////////////////////////////////////////////////////// -// locks -//////////////////////////////////////////////////////////////////////////////// - -EXTERN void omp_init_lock(omp_lock_t *lock) { - __kmpc_impl_init_lock(lock); - PRINT0(LD_IO, "call omp_init_lock()\n"); -} - -EXTERN void omp_destroy_lock(omp_lock_t *lock) { - __kmpc_impl_destroy_lock(lock); - PRINT0(LD_IO, "call omp_destroy_lock()\n"); -} - -EXTERN void omp_set_lock(omp_lock_t *lock) { - __kmpc_impl_set_lock(lock); - PRINT0(LD_IO, "call omp_set_lock()\n"); -} - -EXTERN void omp_unset_lock(omp_lock_t *lock) { - __kmpc_impl_unset_lock(lock); - PRINT0(LD_IO, "call omp_unset_lock()\n"); -} - -EXTERN int omp_test_lock(omp_lock_t *lock) { - int rc = __kmpc_impl_test_lock(lock); - PRINT(LD_IO, "call omp_test_lock() return %d\n", rc); - return rc; -} - -#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu +++ /dev/null @@ -1,813 +0,0 @@ -//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of the KMPC interface -// for the loop construct plus other worksharing constructs that use the same -// interface as loops. 
-// -//===----------------------------------------------------------------------===// -#pragma omp declare target - -#include "common/omptarget.h" -#include "target/shuffle.h" -#include "target_impl.h" - -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// -// template class that encapsulate all the helper functions -// -// T is loop iteration type (32 | 64) (unsigned | signed) -// ST is the signed version of T -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// - -template class omptarget_nvptx_LoopSupport { -public: - //////////////////////////////////////////////////////////////////////////////// - // Loop with static scheduling with chunk - - // Generic implementation of OMP loop scheduling with static policy - /*! \brief Calculate initial bounds for static loop and stride - * @param[in] loc location in code of the call (not used here) - * @param[in] global_tid global thread id - * @param[in] schetype type of scheduling (see omptarget-nvptx.h) - * @param[in] plastiter pointer to last iteration - * @param[in,out] pointer to loop lower bound. it will contain value of - * lower bound of first chunk - * @param[in,out] pointer to loop upper bound. It will contain value of - * upper bound of first chunk - * @param[in,out] pointer to loop stride. 
It will contain value of stride - * between two successive chunks executed by the same thread - * @param[in] loop increment bump - * @param[in] chunk size - */ - - // helper function for static chunk - INLINE static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, - ST chunk, T entityId, T numberOfEntities) { - // each thread executes multiple chunks all of the same size, except - // the last one - - // distance between two successive chunks - stride = numberOfEntities * chunk; - lb = lb + entityId * chunk; - T inputUb = ub; - ub = lb + chunk - 1; // Clang uses i <= ub - // Say ub' is the begining of the last chunk. Then who ever has a - // lower bound plus a multiple of the increment equal to ub' is - // the last one. - T beginingLastChunk = inputUb - (inputUb % chunk); - last = ((beginingLastChunk - lb) % stride) == 0; - } - - //////////////////////////////////////////////////////////////////////////////// - // Loop with static scheduling without chunk - - // helper function for static no chunk - INLINE static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, - ST &chunk, T entityId, - T numberOfEntities) { - // No chunk size specified. 
Each thread or warp gets at most one - // chunk; chunks are all almost of equal size - T loopSize = ub - lb + 1; - - chunk = loopSize / numberOfEntities; - T leftOver = loopSize - chunk * numberOfEntities; - - if (entityId < leftOver) { - chunk++; - lb = lb + entityId * chunk; - } else { - lb = lb + entityId * chunk + leftOver; - } - - T inputUb = ub; - ub = lb + chunk - 1; // Clang uses i <= ub - last = lb <= inputUb && inputUb <= ub; - stride = loopSize; // make sure we only do 1 chunk per warp - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for Static Init - - INLINE static void for_static_init(int32_t gtid, int32_t schedtype, - int32_t *plastiter, T *plower, T *pupper, - ST *pstride, ST chunk, - bool IsSPMDExecutionMode) { - // When IsRuntimeUninitialized is true, we assume that the caller is - // in an L0 parallel region and that all worker threads participate. - - // Assume we are in teams region or that we use a single block - // per target region - ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(IsSPMDExecutionMode); - - // All warps that are in excess of the maximum requested, do - // not execute the loop - PRINT(LD_LOOP, - "OMP Thread %d: schedule type %d, chunk size = %lld, mytid " - "%d, num tids %d\n", - (int)gtid, (int)schedtype, (long long)chunk, (int)gtid, - (int)numberOfActiveOMPThreads); - ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads, - "current thread is not needed here; error"); - - // copy - int lastiter = 0; - T lb = *plower; - T ub = *pupper; - ST stride = *pstride; - // init - switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) { - case kmp_sched_static_chunk: { - if (chunk > 0) { - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - } // note: if chunk <=0, use nochunk - case kmp_sched_static_balanced_chunk: { - if (chunk > 0) { - // round up to make sure the chunk is enough to cover all iterations - T tripCount = ub - lb + 1; // +1 
because ub is inclusive - T span = (tripCount + numberOfActiveOMPThreads - 1) / - numberOfActiveOMPThreads; - // perform chunk adjustment - chunk = (span + chunk - 1) & ~(chunk - 1); - - ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); - T oldUb = ub; - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - if (ub > oldUb) - ub = oldUb; - break; - } - } // note: if chunk <=0, use nochunk - case kmp_sched_static_nochunk: { - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - case kmp_sched_distr_static_chunk: { - if (chunk > 0) { - ForStaticChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(), - GetNumberOfOmpTeams()); - break; - } // note: if chunk <=0, use nochunk - } - case kmp_sched_distr_static_nochunk: { - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(), - GetNumberOfOmpTeams()); - break; - } - case kmp_sched_distr_static_chunk_sched_static_chunkone: { - ForStaticChunk(lastiter, lb, ub, stride, chunk, - numberOfActiveOMPThreads * GetOmpTeamId() + gtid, - GetNumberOfOmpTeams() * numberOfActiveOMPThreads); - break; - } - default: { - ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype); - PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n", - (int)schedtype); - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - } - // copy back - *plastiter = lastiter; - *plower = lb; - *pupper = ub; - *pstride = stride; - PRINT(LD_LOOP, - "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last " - "%d\n", - (int)numberOfActiveOMPThreads, (int)GetNumberOfWorkersInTeam(), - (long long)(*plower), (long long)(*pupper), (long long)(*pstride), - (int)lastiter); - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for dispatch Init - - INLINE static int OrderedSchedule(kmp_sched_t schedule) { - return schedule >= kmp_sched_ordered_first && - schedule <= 
kmp_sched_ordered_last; - } - - INLINE static void dispatch_init(kmp_Ident *loc, int32_t threadId, - kmp_sched_t schedule, T lb, T ub, ST st, - ST chunk) { - if (isRuntimeUninitialized()) { - // In SPMD mode no need to check parallelism level - dynamic scheduling - // may appear only in L2 parallel regions with lightweight runtime. - ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), "Expected non-SPMD mode."); - return; - } - int tid = GetLogicalThreadIdInBlock(); - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid); - T tnum = GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode()); - T tripCount = ub - lb + 1; // +1 because ub is inclusive - ASSERT0(LT_FUSSY, threadId < tnum, - "current thread is not needed here; error"); - - /* Currently just ignore the monotonic and non-monotonic modifiers - * (the compiler isn't producing them * yet anyway). - * When it is we'll want to look at them somewhere here and use that - * information to add to our schedule choice. We shouldn't need to pass - * them on, they merely affect which schedule we can legally choose for - * various dynamic cases. (In particular, whether or not a stealing scheme - * is legal). - */ - schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); - - // Process schedule. 
- if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) { - if (OrderedSchedule(schedule)) - __kmpc_barrier(loc, threadId); - PRINT(LD_LOOP, - "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n", - (long)tnum, (long long)tripCount, (int)schedule); - schedule = kmp_sched_static_chunk; - chunk = tripCount; // one thread gets the whole loop - } else if (schedule == kmp_sched_runtime) { - // process runtime - omp_sched_t rtSched = currTaskDescr->GetRuntimeSched(); - chunk = currTaskDescr->RuntimeChunkSize(); - switch (rtSched) { - case omp_sched_static: { - if (chunk > 0) - schedule = kmp_sched_static_chunk; - else - schedule = kmp_sched_static_nochunk; - break; - } - case omp_sched_auto: { - schedule = kmp_sched_static_chunk; - chunk = 1; - break; - } - case omp_sched_dynamic: - case omp_sched_guided: { - schedule = kmp_sched_dynamic; - break; - } - } - PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", (int)schedule, - (long long)chunk); - } else if (schedule == kmp_sched_auto) { - schedule = kmp_sched_static_chunk; - chunk = 1; - PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", (int)schedule, - (long long)chunk); - } else { - PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", (int)schedule, - (long long)chunk); - ASSERT(LT_FUSSY, - schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, - "unknown schedule %d & chunk %lld\n", (int)schedule, - (long long)chunk); - } - - // init schedules - if (schedule == kmp_sched_static_chunk) { - ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); - // save sched state - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; - // save ub - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; - // compute static chunk - ST stride; - int lastiter = 0; - ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - // save computed params - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; - 
omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; - PRINT(LD_LOOP, - "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 - ", next lower bound = %llu, stride = %llu\n", - (int)tnum, - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - (unsigned long long) - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( - tid)); - } else if (schedule == kmp_sched_static_balanced_chunk) { - ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); - // save sched state - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; - // save ub - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; - // compute static chunk - ST stride; - int lastiter = 0; - // round up to make sure the chunk is enough to cover all iterations - T span = (tripCount + tnum - 1) / tnum; - // perform chunk adjustment - chunk = (span + chunk - 1) & ~(chunk - 1); - - T oldUb = ub; - ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); - if (ub > oldUb) - ub = oldUb; - // save computed params - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; - omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; - PRINT(LD_LOOP, - "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 - ", next lower bound = %llu, stride = %llu\n", - (int)tnum, - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - (unsigned long long) - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( - tid)); - } else if (schedule == kmp_sched_static_nochunk) { - ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value"); - // save sched state - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; - // save ub - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; - // compute 
static chunk - ST stride; - int lastiter = 0; - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - // save computed params - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; - omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; - PRINT(LD_LOOP, - "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 - ", next lower bound = %llu, stride = %llu\n", - (int)tnum, - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - (unsigned long long) - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( - tid)); - } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) { - // save data - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; - if (chunk < 1) - chunk = 1; - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; - __kmpc_barrier(loc, threadId); - if (tid == 0) { - omptarget_nvptx_threadPrivateContext->Cnt() = 0; - __kmpc_impl_threadfence_block(); - } - __kmpc_barrier(loc, threadId); - PRINT(LD_LOOP, - "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64 - ", chunk %" PRIu64 "\n", - (int)tnum, - (unsigned long long) - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - omptarget_nvptx_threadPrivateContext->Chunk(tid)); - } - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for dispatch next - - INLINE static uint64_t Shuffle(__kmpc_impl_lanemask_t active, int64_t val, - int leader) { - uint32_t lo, hi; - __kmpc_impl_unpack(val, lo, hi); - hi = __kmpc_impl_shfl_sync(active, hi, leader); - lo = __kmpc_impl_shfl_sync(active, lo, leader); - return __kmpc_impl_pack(lo, hi); - } - - 
INLINE static uint64_t NextIter() { - __kmpc_impl_lanemask_t active = __kmpc_impl_activemask(); - uint32_t leader = __kmpc_impl_ffs(active) - 1; - uint32_t change = __kmpc_impl_popc(active); - __kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt(); - unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt); - uint64_t warp_res; - if (rank == 0) { - warp_res = __kmpc_atomic_add( - (unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(), - (unsigned long long)change); - } - warp_res = Shuffle(active, warp_res, leader); - return warp_res + rank; - } - - INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize, - T loopLowerBound, T loopUpperBound) { - T N = NextIter(); - lb = loopLowerBound + N * chunkSize; - ub = lb + chunkSize - 1; // Clang uses i <= ub - - // 3 result cases: - // a. lb and ub < loopUpperBound --> NOT_FINISHED - // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk --> - // NOT_FINISHED - // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED - // a. - if (lb <= loopUpperBound && ub < loopUpperBound) { - PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", - (long long)lb, (long long)ub, (long long)loopUpperBound); - return NOT_FINISHED; - } - // b. - if (lb <= loopUpperBound) { - PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n", - (long long)lb, (long long)ub, (long long)loopUpperBound); - ub = loopUpperBound; - return LAST_CHUNK; - } - // c. if we are here, we are in case 'c' - lb = loopUpperBound + 2; - ub = loopUpperBound + 1; - PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", (long long)lb, - (long long)ub, (long long)loopUpperBound); - return FINISHED; - } - - INLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast, - T *plower, T *pupper, ST *pstride) { - if (isRuntimeUninitialized()) { - // In SPMD mode no need to check parallelism level - dynamic scheduling - // may appear only in L2 parallel regions with lightweight runtime. 
- ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), "Expected non-SPMD mode."); - if (*plast) - return DISPATCH_FINISHED; - *plast = 1; - return DISPATCH_NOTFINISHED; - } - // ID of a thread in its own warp - - // automatically selects thread or warp ID based on selected implementation - int tid = GetLogicalThreadIdInBlock(); - ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode()), - "current thread is not needed here; error"); - // retrieve schedule - kmp_sched_t schedule = - omptarget_nvptx_threadPrivateContext->ScheduleType(tid); - - // xxx reduce to one - if (schedule == kmp_sched_static_chunk || - schedule == kmp_sched_static_nochunk) { - T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid); - T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid); - // finished? - if (myLb > ub) { - PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n", - (long long)myLb, (long long)ub); - return DISPATCH_FINISHED; - } - // not finished, save current bounds - ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid); - *plower = myLb; - T myUb = myLb + chunk - 1; // Clang uses i <= ub - if (myUb > ub) - myUb = ub; - *pupper = myUb; - *plast = (int32_t)(myUb == ub); - - // increment next lower bound by the stride - ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid); - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride; - PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n", - (long long)*plower, (long long)*pupper); - return DISPATCH_NOTFINISHED; - } - ASSERT0(LT_FUSSY, - schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, - "bad sched"); - T myLb, myUb; - int finished = DynamicNextChunk( - myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid), - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid)); - - if (finished == FINISHED) - return DISPATCH_FINISHED; - - // not finished (either not 
finished or last chunk) - *plast = (int32_t)(finished == LAST_CHUNK); - *plower = myLb; - *pupper = myUb; - *pstride = 1; - - PRINT(LD_LOOP, - "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, " - "last %d\n", - (int)GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode()), - (int)GetNumberOfWorkersInTeam(), (long long)*plower, - (long long)*pupper, (long long)*pstride, (int)*plast); - return DISPATCH_NOTFINISHED; - } - - INLINE static void dispatch_fini() { - // nothing - } - - //////////////////////////////////////////////////////////////////////////////// - // end of template class that encapsulate all the helper functions - //////////////////////////////////////////////////////////////////////////////// -}; - -//////////////////////////////////////////////////////////////////////////////// -// KMP interface implementation (dyn loops) -//////////////////////////////////////////////////////////////////////////////// - -// init -EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t tid, - int32_t schedule, int32_t lb, int32_t ub, - int32_t st, int32_t chunk) { - PRINT0(LD_IO, "call kmpc_dispatch_init_4\n"); - omptarget_nvptx_LoopSupport::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); -} - -EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t tid, - int32_t schedule, uint32_t lb, uint32_t ub, - int32_t st, int32_t chunk) { - PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n"); - omptarget_nvptx_LoopSupport::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); -} - -EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t tid, - int32_t schedule, int64_t lb, int64_t ub, - int64_t st, int64_t chunk) { - PRINT0(LD_IO, "call kmpc_dispatch_init_8\n"); - omptarget_nvptx_LoopSupport::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); -} - -EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t tid, - int32_t schedule, uint64_t lb, uint64_t ub, - int64_t st, int64_t chunk) { - PRINT0(LD_IO, 
"call kmpc_dispatch_init_8u\n"); - omptarget_nvptx_LoopSupport::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); -} - -// next -EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last, - int32_t *p_lb, int32_t *p_ub, int32_t *p_st) { - PRINT0(LD_IO, "call kmpc_dispatch_next_4\n"); - return omptarget_nvptx_LoopSupport::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st); -} - -EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid, int32_t *p_last, - uint32_t *p_lb, uint32_t *p_ub, - int32_t *p_st) { - PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n"); - return omptarget_nvptx_LoopSupport::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st); -} - -EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last, - int64_t *p_lb, int64_t *p_ub, int64_t *p_st) { - PRINT0(LD_IO, "call kmpc_dispatch_next_8\n"); - return omptarget_nvptx_LoopSupport::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st); -} - -EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid, int32_t *p_last, - uint64_t *p_lb, uint64_t *p_ub, - int64_t *p_st) { - PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n"); - return omptarget_nvptx_LoopSupport::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st); -} - -// fini -EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t tid) { - PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n"); - omptarget_nvptx_LoopSupport::dispatch_fini(); -} - -EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t tid) { - PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n"); - omptarget_nvptx_LoopSupport::dispatch_fini(); -} - -EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t tid) { - PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n"); - omptarget_nvptx_LoopSupport::dispatch_fini(); -} - -EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t tid) { - PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n"); - omptarget_nvptx_LoopSupport::dispatch_fini(); -} - 
-//////////////////////////////////////////////////////////////////////////////// -// KMP interface implementation (static loops) -//////////////////////////////////////////////////////////////////////////////// - -EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - __kmpc_is_spmd_exec_mode()); -} - -EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4u\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - __kmpc_is_spmd_exec_mode()); -} - -EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - __kmpc_is_spmd_exec_mode()); -} - -EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8u\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - __kmpc_is_spmd_exec_mode()); -} - -EXTERN void __kmpc_distribute_static_init_4(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, - int32_t *plastiter, int32_t *plower, - int32_t 
*pupper, int32_t *pstride, - int32_t incr, int32_t chunk) { - PRINT0(LD_IO, "call kmpc_distribute_static_init_4\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - __kmpc_is_spmd_exec_mode()); -} - -EXTERN void __kmpc_distribute_static_init_4u(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, - int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk) { - PRINT0(LD_IO, "call kmpc_distribute_static_init_4u\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - __kmpc_is_spmd_exec_mode()); -} - -EXTERN void __kmpc_distribute_static_init_8(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, - int32_t *plastiter, int64_t *plower, - int64_t *pupper, int64_t *pstride, - int64_t incr, int64_t chunk) { - PRINT0(LD_IO, "call kmpc_distribute_static_init_8\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - __kmpc_is_spmd_exec_mode()); -} - -EXTERN void __kmpc_distribute_static_init_8u(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, - int32_t *plastiter, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk) { - PRINT0(LD_IO, "call kmpc_distribute_static_init_8u\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - __kmpc_is_spmd_exec_mode()); -} - -EXTERN -void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/true); -} - -EXTERN -void 
__kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, - int32_t *plastiter, uint32_t *plower, - uint32_t *pupper, int32_t *pstride, - int32_t incr, int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/true); -} - -EXTERN -void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/true); -} - -EXTERN -void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, - int32_t *plastiter, uint64_t *plower, - uint64_t *pupper, int64_t *pstride, - int64_t incr, int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/true); -} - -EXTERN -void __kmpc_for_static_init_4_simple_generic(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, - int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/false); -} - -EXTERN -void __kmpc_for_static_init_4u_simple_generic( - kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, - int32_t chunk) { - PRINT0(LD_IO, "call 
kmpc_for_static_init_4u_simple_generic\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/false); -} - -EXTERN -void __kmpc_for_static_init_8_simple_generic(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, - int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/false); -} - -EXTERN -void __kmpc_for_static_init_8u_simple_generic( - kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, - uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, - int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/false); -} - -EXTERN void __kmpc_distribute_static_fini(kmp_Ident *loc, int32_t global_tid) { - PRINT0(LD_IO, "call kmpc_distribute_static_fini\n"); -} - -EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) { - PRINT0(LD_IO, "call kmpc_for_static_fini\n"); -} - -#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu +++ /dev/null @@ -1,65 +0,0 @@ -//===------------ omp_data.cu - OpenMP GPU objects --------------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the data objects used on the GPU device. -// -//===----------------------------------------------------------------------===// -#pragma omp declare target - -#include "common/allocator.h" -#include "common/omptarget.h" - -//////////////////////////////////////////////////////////////////////////////// -// global device environment -//////////////////////////////////////////////////////////////////////////////// - -PLUGIN_ACCESSIBLE -DeviceEnvironmentTy omptarget_device_environment; - -//////////////////////////////////////////////////////////////////////////////// -// global data holding OpenMP state information -//////////////////////////////////////////////////////////////////////////////// - -// OpenMP will try to call its ctor if we don't add the attribute explicitly -[[clang::loader_uninitialized]] omptarget_nvptx_Queue< - omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT> - omptarget_nvptx_device_State[MAX_SM]; - -omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager; -uint32_t SHARED(usedMemIdx); -uint32_t SHARED(usedSlotIdx); - -// SHARED doesn't work with array so we add the attribute explicitly. -[[clang::loader_uninitialized]] uint8_t - parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; -#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc) -uint16_t SHARED(threadLimit); -uint16_t SHARED(threadsInTeam); -uint16_t SHARED(nThreads); -// Pointer to this team's OpenMP state object -omptarget_nvptx_ThreadPrivateContext * - SHARED(omptarget_nvptx_threadPrivateContext); - -//////////////////////////////////////////////////////////////////////////////// -// The team master sets the outlined parallel function in this variable to -// communicate with the workers. 
Since it is in shared memory, there is one -// copy of these variables for each kernel, instance, and team. -//////////////////////////////////////////////////////////////////////////////// -omptarget_nvptx_WorkFn SHARED(omptarget_nvptx_workFn); - -//////////////////////////////////////////////////////////////////////////////// -// OpenMP kernel execution parameters -//////////////////////////////////////////////////////////////////////////////// -int8_t SHARED(execution_param); - -//////////////////////////////////////////////////////////////////////////////// -// Scratchpad for teams reduction. -//////////////////////////////////////////////////////////////////////////////// -void *SHARED(ReductionScratchpadPtr); - -#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ /dev/null @@ -1,259 +0,0 @@ -//===--- omptarget.cu - OpenMP GPU initialization ---------------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the initialization code for the GPU -// -//===----------------------------------------------------------------------===// -#pragma omp declare target - -#include "common/omptarget.h" -#include "common/support.h" -#include "target_impl.h" - -//////////////////////////////////////////////////////////////////////////////// -// global data tables -//////////////////////////////////////////////////////////////////////////////// - -extern omptarget_nvptx_Queue - omptarget_nvptx_device_State[MAX_SM]; - -//////////////////////////////////////////////////////////////////////////////// -// init entry points -//////////////////////////////////////////////////////////////////////////////// - -static void __kmpc_generic_kernel_init() { - PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n", - OMPTARGET_NVPTX_VERSION); - - if (GetLaneId() == 0) - parallelLevel[GetWarpId()] = 0; - - int threadIdInBlock = __kmpc_get_hardware_thread_id_in_block(); - if (threadIdInBlock != GetMasterThreadID()) - return; - - setExecutionParameters(OMP_TGT_EXEC_MODE_GENERIC, OMP_TGT_RUNTIME_INITIALIZED); - ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(), - "__kmpc_kernel_init() must be called by team master warp only!"); - PRINT0(LD_IO, "call to __kmpc_kernel_init for master\n"); - - // Get a state object from the queue. - int slot = __kmpc_impl_smid() % MAX_SM; - usedSlotIdx = slot; - omptarget_nvptx_threadPrivateContext = - omptarget_nvptx_device_State[slot].Dequeue(); - - // init thread private - int threadId = 0; - omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(threadId); - - // init team context - omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); - currTeamDescr.InitTeamDescr(); - // this thread will start execution... 
has to update its task ICV - // to point to the level zero task ICV. That ICV was init in - // InitTeamDescr() - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( - threadId, currTeamDescr.LevelZeroTaskDescr()); - - // set number of threads and thread limit in team to started value - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - nThreads = GetNumberOfWorkersInTeam(); - threadLimit = nThreads; - - __kmpc_data_sharing_init_stack(); - __kmpc_impl_target_init(); -} - -static void __kmpc_generic_kernel_deinit() { - PRINT0(LD_IO, "call to __kmpc_kernel_deinit\n"); - // Enqueue omp state object for use by another team. - int slot = usedSlotIdx; - omptarget_nvptx_device_State[slot].Enqueue( - omptarget_nvptx_threadPrivateContext); - // Done with work. Kill the workers. - omptarget_nvptx_workFn = 0; -} - -static void __kmpc_spmd_kernel_init(bool RequiresFullRuntime) { - PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n"); - - setExecutionParameters(OMP_TGT_EXEC_MODE_SPMD, - RequiresFullRuntime ? OMP_TGT_RUNTIME_INITIALIZED - : OMP_TGT_RUNTIME_UNINITIALIZED); - int threadId = __kmpc_get_hardware_thread_id_in_block(); - if (threadId == 0) { - usedSlotIdx = __kmpc_impl_smid() % MAX_SM; - } - - if (GetLaneId() == 0) { - parallelLevel[GetWarpId()] = - 1 + (__kmpc_get_hardware_num_threads_in_block() > 1 - ? OMP_ACTIVE_PARALLEL_LEVEL - : 0); - } - - __kmpc_data_sharing_init_stack(); - if (!RequiresFullRuntime) - return; - - // - // Team Context Initialization. - // - // In SPMD mode there is no master thread so use any cuda thread for team - // context initialization. - if (threadId == 0) { - // Get a state object from the queue. 
- omptarget_nvptx_threadPrivateContext = - omptarget_nvptx_device_State[usedSlotIdx].Dequeue(); - - omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); - omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); - // init team context - currTeamDescr.InitTeamDescr(); - } - __kmpc_impl_syncthreads(); - - omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); - omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); - - // - // Initialize task descr for each thread. - // - omptarget_nvptx_TaskDescr *newTaskDescr = - omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); - ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); - newTaskDescr->InitLevelOneTaskDescr(currTeamDescr.LevelZeroTaskDescr()); - // install new top descriptor - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, - newTaskDescr); - - // init thread private from init value - int ThreadLimit = GetNumberOfProcsInTeam(/* IsSPMD */ true); - PRINT(LD_PAR, - "thread will execute parallel region with id %d in a team of " - "%d threads\n", - (int)newTaskDescr->ThreadId(), (int)ThreadLimit); -} - -static void __kmpc_spmd_kernel_deinit(bool RequiresFullRuntime) { - // We're not going to pop the task descr stack of each thread since - // there are no more parallel regions in SPMD mode. - if (!RequiresFullRuntime) - return; - - __kmpc_impl_syncthreads(); - int threadId = __kmpc_get_hardware_thread_id_in_block(); - if (threadId == 0) { - // Enqueue omp state object for use by another team. - int slot = usedSlotIdx; - omptarget_nvptx_device_State[slot].Enqueue( - omptarget_nvptx_threadPrivateContext); - } -} - -// Return true if the current target region is executed in SPMD mode. -// NOTE: This function has to return 1 for SPMD mode, and 0 for generic mode. -// That's because `__kmpc_parallel_51` checks if it's already in parallel region -// by comparision between the parallel level and the return value of this -// function. 
-EXTERN int8_t __kmpc_is_spmd_exec_mode() { - return (execution_param & OMP_TGT_EXEC_MODE_SPMD) == OMP_TGT_EXEC_MODE_SPMD; -} - -EXTERN int8_t __kmpc_is_generic_main_thread(kmp_int32 Tid) { - return !__kmpc_is_spmd_exec_mode() && __kmpc_is_generic_main_thread_id(Tid); -} - -NOINLINE EXTERN int8_t __kmpc_is_generic_main_thread_id(kmp_int32 Tid) { - return GetMasterThreadID() == Tid; -} - -EXTERN bool __kmpc_kernel_parallel(void**WorkFn); - -static void __kmpc_target_region_state_machine(ident_t *Ident) { - - int TId = __kmpc_get_hardware_thread_id_in_block(); - do { - void* WorkFn = 0; - - // Wait for the signal that we have a new work function. - __kmpc_barrier_simple_spmd(Ident, TId); - - - // Retrieve the work function from the runtime. - bool IsActive = __kmpc_kernel_parallel(&WorkFn); - - // If there is nothing more to do, break out of the state machine by - // returning to the caller. - if (!WorkFn) - return; - - if (IsActive) { - ((void(*)(uint32_t,uint32_t))WorkFn)(0, TId); - __kmpc_kernel_end_parallel(); - } - - __kmpc_barrier_simple_spmd(Ident, TId); - - } while (true); -} - -EXTERN -int32_t __kmpc_target_init(ident_t *Ident, int8_t Mode, - bool UseGenericStateMachine, - bool RequiresFullRuntime) { - const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD; - int TId = __kmpc_get_hardware_thread_id_in_block(); - if (IsSPMD) - __kmpc_spmd_kernel_init(RequiresFullRuntime); - else - __kmpc_generic_kernel_init(); - - if (IsSPMD) { - __kmpc_barrier_simple_spmd(Ident, TId); - return -1; - } - - if (TId == GetMasterThreadID()) - return -1; - - // Enter the generic state machine if enabled and if this thread can possibly - // be an active worker thread. - // - // The latter check is important for NVIDIA Pascal (but not Volta) and AMD - // GPU. In those cases, a single thread can apparently satisfy a barrier on - // behalf of all threads in the same warp. 
Thus, it would not be safe for - // other threads in the main thread's warp to reach the first - // __kmpc_barrier_simple_spmd call in __kmpc_target_region_state_machine - // before the main thread reaches its corresponding - // __kmpc_barrier_simple_spmd call: that would permit all active worker - // threads to proceed before the main thread has actually set - // omptarget_nvptx_workFn, and then they would immediately quit without - // doing any work. GetNumberOfWorkersInTeam() does not include any of the - // main thread's warp, so none of its threads can ever be active worker - // threads. - if (UseGenericStateMachine && TId < GetNumberOfWorkersInTeam()) - __kmpc_target_region_state_machine(Ident); - - return TId; -} - -EXTERN -void __kmpc_target_deinit(ident_t *Ident, int8_t Mode, - bool RequiresFullRuntime) { - const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD; - if (IsSPMD) - __kmpc_spmd_kernel_deinit(RequiresFullRuntime); - else - __kmpc_generic_kernel_deinit(); -} - -#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu +++ /dev/null @@ -1,341 +0,0 @@ -//===---- parallel.cu - GPU OpenMP parallel implementation ------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Parallel implementation in the GPU. 
Here is the pattern: -// -// while (not finished) { -// -// if (master) { -// sequential code, decide which par loop to do, or if finished -// __kmpc_kernel_prepare_parallel() // exec by master only -// } -// syncthreads // A -// __kmpc_kernel_parallel() // exec by all -// if (this thread is included in the parallel) { -// switch () for all parallel loops -// __kmpc_kernel_end_parallel() // exec only by threads in parallel -// } -// -// -// The reason we don't exec end_parallel for the threads not included -// in the parallel loop is that for each barrier in the parallel -// region, these non-included threads will cycle through the -// syncthread A. Thus they must preserve their current threadId that -// is larger than thread in team. -// -// To make a long story short... -// -//===----------------------------------------------------------------------===// -#pragma omp declare target - -#include "common/omptarget.h" -#include "target_impl.h" - -//////////////////////////////////////////////////////////////////////////////// -// support for parallel that goes parallel (1 static level only) -//////////////////////////////////////////////////////////////////////////////// - -INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause, - uint16_t NThreadsICV, - uint16_t ThreadLimit) { - uint16_t ThreadsRequested = NThreadsICV; - if (NumThreadsClause != 0) { - ThreadsRequested = NumThreadsClause; - } - - uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam(); - if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable) { - ThreadsAvailable = ThreadLimit; - } - - uint16_t NumThreads = ThreadsAvailable; - if (ThreadsRequested != 0 && ThreadsRequested < NumThreads) { - NumThreads = ThreadsRequested; - } - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - // On Volta and newer architectures we require that all lanes in - // a warp participate in the parallel region. Round down to a - // multiple of WARPSIZE since it is legal to do so in OpenMP. 
- if (NumThreads < WARPSIZE) { - NumThreads = 1; - } else { - NumThreads = (NumThreads & ~((uint16_t)WARPSIZE - 1)); - } -#endif - - return NumThreads; -} - -// This routine is always called by the team master.. -EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, - kmp_int32 NumThreadsClause) { - PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n"); - - omptarget_nvptx_workFn = WorkFn; - - // This routine is only called by the team master. The team master is - // the first thread of the last warp. It always has the logical thread - // id of 0 (since it is a shadow for the first worker thread). - const int threadId = 0; - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); - ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(), - "cannot be called in a parallel region."); - if (currTaskDescr->InParallelRegion()) { - PRINT0(LD_PAR, "already in parallel: go seq\n"); - return; - } - - uint16_t NumThreads = - determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit); - - if (NumThreadsClause != 0) { - // Reset request to avoid propagating to successive #parallel - NumThreadsClause = 0; - } - - ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads", - (int)NumThreads); - ASSERT0(LT_FUSSY, - __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(), - "only team master can create parallel"); - - // Set number of threads on work descriptor. - omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); - workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr); - threadsInTeam = NumThreads; -} - -// All workers call this function. Deactivate those not needed. -// Fn - the outlined work function to execute. -// returns True if this thread is active, else False. -// -// Only the worker threads call this routine. 
-EXTERN bool __kmpc_kernel_parallel(void **WorkFn) { - PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n"); - - // Work function and arguments for L1 parallel region. - *WorkFn = omptarget_nvptx_workFn; - - // If this is the termination signal from the master, quit early. - if (!*WorkFn) { - PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel finished\n"); - return false; - } - - // Only the worker threads call this routine and the master warp - // never arrives here. Therefore, use the nvptx thread id. - int threadId = __kmpc_get_hardware_thread_id_in_block(); - omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); - // Set to true for workers participating in the parallel region. - bool isActive = false; - // Initialize state for active threads. - if (threadId < threadsInTeam) { - // init work descriptor from workdesccr - omptarget_nvptx_TaskDescr *newTaskDescr = - omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); - ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); - newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr()); - // install new top descriptor - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, - newTaskDescr); - // init private from int value - PRINT(LD_PAR, - "thread will execute parallel region with id %d in a team of " - "%d threads\n", - (int)newTaskDescr->ThreadId(), (int)nThreads); - - isActive = true; - } - - return isActive; -} - -EXTERN void __kmpc_kernel_end_parallel() { - // pop stack - PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n"); - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); - - // Only the worker threads call this routine and the master warp - // never arrives here. Therefore, use the nvptx thread id. 
- int threadId = __kmpc_get_hardware_thread_id_in_block(); - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( - threadId, currTaskDescr->GetPrevTaskDescr()); -} - -//////////////////////////////////////////////////////////////////////////////// -// support for parallel that goes sequential -//////////////////////////////////////////////////////////////////////////////// - -static void serializedParallel(kmp_Ident *loc, uint32_t global_tid) { - PRINT0(LD_IO, "call to serializedParallel\n"); - - IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask()); - - if (isRuntimeUninitialized()) { - ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), - "Expected SPMD mode with uninitialized runtime."); - return; - } - - // assume this is only called for nested parallel - int threadId = GetLogicalThreadIdInBlock(); - - // unlike actual parallel, threads in the same team do not share - // the workTaskDescr in this case and num threads is fixed to 1 - - // get current task - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); - currTaskDescr->SaveLoopData(); - - // allocate new task descriptor and copy value from current one, set prev to - // it - omptarget_nvptx_TaskDescr *newTaskDescr = - (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr), - "new seq parallel task"); - newTaskDescr->CopyParent(currTaskDescr); - - // tweak values for serialized parallel case: - // - each thread becomes ID 0 in its serialized parallel, and - // - there is only one thread per team - newTaskDescr->ThreadId() = 0; - - // set new task descriptor as top - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, - newTaskDescr); -} - -static void endSerializedParallel(kmp_Ident *loc, - uint32_t global_tid) { - PRINT0(LD_IO, "call to endSerializedParallel\n"); - - DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask()); - - if 
(isRuntimeUninitialized()) { - ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), - "Expected SPMD mode with uninitialized runtime."); - return; - } - - // pop stack - int threadId = GetLogicalThreadIdInBlock(); - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); - // set new top - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( - threadId, currTaskDescr->GetPrevTaskDescr()); - // free - SafeFree(currTaskDescr, "new seq parallel task"); - currTaskDescr = getMyTopTaskDescriptor(threadId); - currTaskDescr->RestoreLoopData(); -} - -NOINLINE EXTERN uint8_t __kmpc_parallel_level() { - return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1); -} - -// This kmpc call returns the thread id across all teams. It's value is -// cached by the compiler and used when calling the runtime. On nvptx -// it's cheap to recalculate this value so we never use the result -// of this call. -EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) { - return GetOmpThreadId(); -} - -//////////////////////////////////////////////////////////////////////////////// -// push params -//////////////////////////////////////////////////////////////////////////////// - -// Do nothing. The host guarantees we started the requested number of -// teams and we only need inspection of gridDim. 
- -EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid, - int32_t num_teams, int32_t thread_limit) { - PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams); - ASSERT0(LT_FUSSY, 0, "should never have anything with new teams on device"); -} - -EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid, int proc_bind) { - PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind); -} - -//////////////////////////////////////////////////////////////////////////////// -// parallel interface -//////////////////////////////////////////////////////////////////////////////// - -NOINLINE EXTERN void __kmpc_parallel_51(kmp_Ident *ident, kmp_int32 global_tid, - kmp_int32 if_expr, - kmp_int32 num_threads, int proc_bind, - void *fn, void *wrapper_fn, void **args, - size_t nargs) { - // Handle the serialized case first, same for SPMD/non-SPMD except that in - // SPMD mode we already incremented the parallel level counter, account for - // that. - bool InParallelRegion = - (__kmpc_parallel_level() > __kmpc_is_spmd_exec_mode()); - if (!if_expr || InParallelRegion) { - serializedParallel(ident, global_tid); - __kmp_invoke_microtask(global_tid, 0, fn, args, nargs); - endSerializedParallel(ident, global_tid); - return; - } - - if (__kmpc_is_spmd_exec_mode()) { - __kmp_invoke_microtask(global_tid, 0, fn, args, nargs); - return; - } - - __kmpc_kernel_prepare_parallel((void *)wrapper_fn, num_threads); - - if (nargs) { - void **GlobalArgs; - __kmpc_begin_sharing_variables(&GlobalArgs, nargs); - // TODO: faster memcpy? -#pragma unroll - for (int I = 0; I < nargs; I++) - GlobalArgs[I] = args[I]; - } - - // TODO: what if that's a parallel region with a single thread? this is - // considered not active in the existing implementation. - bool IsActiveParallelRegion = threadsInTeam != 1; - int NumWarps = - threadsInTeam / WARPSIZE + ((threadsInTeam % WARPSIZE) ? 1 : 0); - // Increment parallel level for non-SPMD warps. 
- for (int I = 0; I < NumWarps; ++I) - parallelLevel[I] += - (1 + (IsActiveParallelRegion ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); - - // Master signals work to activate workers. - __kmpc_barrier_simple_spmd(ident, 0); - - // OpenMP [2.5, Parallel Construct, p.49] - // There is an implied barrier at the end of a parallel region. After the - // end of a parallel region, only the master thread of the team resumes - // execution of the enclosing task region. - // - // The master waits at this barrier until all workers are done. - __kmpc_barrier_simple_spmd(ident, 0); - - // Decrement parallel level for non-SPMD warps. - for (int I = 0; I < NumWarps; ++I) - parallelLevel[I] -= - (1 + (IsActiveParallelRegion ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); - // TODO: Is synchronization needed since out of parallel execution? - - if (nargs) - __kmpc_end_sharing_variables(); - - // TODO: proc_bind is a noop? - // if (proc_bind != proc_bind_default) - // __kmpc_push_proc_bind(ident, global_tid, proc_bind); -} - -#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu +++ /dev/null @@ -1,309 +0,0 @@ -//===---- reduction.cu - GPU OpenMP reduction implementation ----- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of reduction with KMPC interface. 
-// -//===----------------------------------------------------------------------===// -#pragma omp declare target - -#include "common/omptarget.h" -#include "target/shuffle.h" -#include "target_impl.h" - -EXTERN -void __kmpc_nvptx_end_reduce(int32_t global_tid) {} - -EXTERN -void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {} - -INLINE static void gpu_regular_warp_reduce(void *reduce_data, - kmp_ShuffleReductFctPtr shflFct) { - for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) { - shflFct(reduce_data, /*LaneId - not used= */ 0, - /*Offset = */ mask, /*AlgoVersion=*/0); - } -} - -INLINE static void gpu_irregular_warp_reduce(void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, - uint32_t size, uint32_t tid) { - uint32_t curr_size; - uint32_t mask; - curr_size = size; - mask = curr_size / 2; - while (mask > 0) { - shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1); - curr_size = (curr_size + 1) / 2; - mask = curr_size / 2; - } -} - -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 -INLINE static uint32_t -gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) { - uint32_t size, remote_id, physical_lane_id; - physical_lane_id = __kmpc_get_hardware_thread_id_in_block() % WARPSIZE; - __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt(); - __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); - uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2; - __kmpc_impl_lanemask_t lanemask_gt = __kmpc_impl_lanemask_gt(); - do { - Liveness = __kmpc_impl_activemask(); - remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt); - size = __kmpc_impl_popc(Liveness); - logical_lane_id /= 2; - shflFct(reduce_data, /*LaneId =*/logical_lane_id, - /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2); - } while (logical_lane_id % 2 == 0 && size > 1); - return (logical_lane_id == 0); -} -#endif - -INLINE -static int32_t nvptx_parallel_reduce_nowait( - int32_t global_tid, int32_t num_vars, 
size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, - bool isSPMDExecutionMode, bool isRuntimeUninitialized) { - uint32_t BlockThreadId = GetLogicalThreadIdInBlock(); - uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode); - if (NumThreads == 1) - return 1; - /* - * This reduce function handles reduction within a team. It handles - * parallel regions in both L1 and L2 parallelism levels. It also - * supports Generic, SPMD, and NoOMP modes. - * - * 1. Reduce within a warp. - * 2. Warp master copies value to warp 0 via shared memory. - * 3. Warp 0 reduces to a single value. - * 4. The reduced value is available in the thread that returns 1. - */ - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE; - uint32_t WarpId = BlockThreadId / WARPSIZE; - - // Volta execution model: - // For the Generic execution mode a parallel region either has 1 thread and - // beyond that, always a multiple of 32. For the SPMD execution mode we may - // have any number of threads. - if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1)) - gpu_regular_warp_reduce(reduce_data, shflFct); - else if (NumThreads > 1) // Only SPMD execution mode comes thru this case. - gpu_irregular_warp_reduce( - reduce_data, shflFct, - /*LaneCount=*/NumThreads % WARPSIZE, - /*LaneId=*/__kmpc_get_hardware_thread_id_in_block() % WARPSIZE); - - // When we have more than [warpsize] number of threads - // a block reduction is performed here. - // - // Only L1 parallel region can enter this if condition. - if (NumThreads > WARPSIZE) { - // Gather all the reduced values from each warp - // to the first warp. 
- cpyFct(reduce_data, WarpsNeeded); - - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - BlockThreadId); - } - return BlockThreadId == 0; -#else - __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); - if (Liveness == __kmpc_impl_all_lanes) // Full warp - gpu_regular_warp_reduce(reduce_data, shflFct); - else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes - gpu_irregular_warp_reduce( - reduce_data, shflFct, - /*LaneCount=*/__kmpc_impl_popc(Liveness), - /*LaneId=*/__kmpc_get_hardware_thread_id_in_block() % WARPSIZE); - else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2 - // parallel region may enter here; return - // early. - return gpu_irregular_simd_reduce(reduce_data, shflFct); - - // When we have more than [warpsize] number of threads - // a block reduction is performed here. - // - // Only L1 parallel region can enter this if condition. - if (NumThreads > WARPSIZE) { - uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE; - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); - - uint32_t WarpId = BlockThreadId / WARPSIZE; - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - BlockThreadId); - - return BlockThreadId == 0; - } else if (isRuntimeUninitialized /* Never an L2 parallel region without the OMP runtime */) { - return BlockThreadId == 0; - } - - // Get the OMP thread Id. This is different from BlockThreadId in the case of - // an L2 parallel region. 
- return global_tid == 0; -#endif // __CUDA_ARCH__ >= 700 -} - -EXTERN -int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( - kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, - void *reduce_data, kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct) { - return nvptx_parallel_reduce_nowait( - global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, - __kmpc_is_spmd_exec_mode(), isRuntimeUninitialized()); -} - -INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) { - return !__kmpc_is_spmd_exec_mode() || IsTeamMaster(ThreadId); -} - -INLINE static uint32_t roundToWarpsize(uint32_t s) { - if (s < WARPSIZE) - return 1; - return (s & ~(unsigned)(WARPSIZE - 1)); -} - -INLINE static uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; } - -static volatile uint32_t IterCnt = 0; -static volatile uint32_t Cnt = 0; -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( - kmp_Ident *loc, int32_t global_tid, void *global_buffer, - int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct, - kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct, - kmp_ListGlobalFctPtr glredFct) { - - // Terminate all threads in non-SPMD mode except for the master thread. - if (!__kmpc_is_spmd_exec_mode() && - !__kmpc_is_generic_main_thread(__kmpc_get_hardware_thread_id_in_block())) - return 0; - - uint32_t ThreadId = GetLogicalThreadIdInBlock(); - - // In non-generic mode all workers participate in the teams reduction. - // In generic mode only the team master participates in the teams - // reduction because the workers are waiting for parallel work. - uint32_t NumThreads = - __kmpc_is_spmd_exec_mode() ? 
GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true) - : /*Master thread only*/ 1; - uint32_t TeamId = GetBlockIdInKernel(); - uint32_t NumTeams = __kmpc_get_hardware_num_blocks(); - static unsigned SHARED(Bound); - static unsigned SHARED(ChunkTeamCount); - - // Block progress for teams greater than the current upper - // limit. We always only allow a number of teams less or equal - // to the number of slots in the buffer. - bool IsMaster = isMaster(loc, ThreadId); - while (IsMaster) { - // Atomic read - Bound = __kmpc_atomic_add((uint32_t *)&IterCnt, 0u); - if (TeamId < Bound + num_of_records) - break; - } - - if (IsMaster) { - int ModBockId = TeamId % num_of_records; - if (TeamId < num_of_records) - lgcpyFct(global_buffer, ModBockId, reduce_data); - else - lgredFct(global_buffer, ModBockId, reduce_data); - __kmpc_impl_threadfence_system(); - - // Increment team counter. - // This counter is incremented by all teams in the current - // BUFFER_SIZE chunk. - ChunkTeamCount = __kmpc_atomic_inc((uint32_t *)&Cnt, num_of_records - 1u); - } - // Synchronize - if (__kmpc_is_spmd_exec_mode()) - __kmpc_barrier(loc, global_tid); - - // reduce_data is global or shared so before being reduced within the - // warp we need to bring it in local memory: - // local_reduce_data = reduce_data[i] - // - // Example for 3 reduction variables a, b, c (of potentially different - // types): - // - // buffer layout (struct of arrays): - // a, a, ..., a, b, b, ... b, c, c, ... c - // |__________| - // num_of_records - // - // local_data_reduce layout (struct): - // a, b, c - // - // Each thread will have a local struct containing the values to be - // reduced: - // 1. do reduction within each warp. - // 2. do reduction across warps. - // 3. write the final result to the main reduction variable - // by returning 1 in the thread holding the reduction result. - - // Check if this is the very last team. 
- unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records)); - if (ChunkTeamCount == NumTeams - Bound - 1) { - // - // Last team processing. - // - if (ThreadId >= NumRecs) - return 0; - NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs)); - if (ThreadId >= NumThreads) - return 0; - - // Load from buffer and reduce. - glcpyFct(global_buffer, ThreadId, reduce_data); - for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads) - glredFct(global_buffer, i, reduce_data); - - // Reduce across warps to the warp master. - if (NumThreads > 1) { - gpu_regular_warp_reduce(reduce_data, shflFct); - - // When we have more than [warpsize] number of threads - // a block reduction is performed here. - uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads); - if (ActiveThreads > WARPSIZE) { - uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); - - uint32_t WarpId = ThreadId / WARPSIZE; - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - ThreadId); - } - } - - if (IsMaster) { - Cnt = 0; - IterCnt = 0; - return 1; - } - return 0; - } - if (IsMaster && ChunkTeamCount == num_of_records - 1) { - // Allow SIZE number of teams to proceed writing their - // intermediate results to the global buffer. - __kmpc_atomic_add((uint32_t *)&IterCnt, uint32_t(num_of_records)); - } - - return 0; -} - -#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/shuffle.cpp b/openmp/libomptarget/deviceRTLs/common/src/shuffle.cpp deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/shuffle.cpp +++ /dev/null @@ -1,29 +0,0 @@ -//===--- shuffle.cpp - Implementation of the external shuffle idiom API -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// - -#include "target/shuffle.h" - -#pragma omp declare target - -static constexpr uint64_t AllLanes = -1; - -int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) { - return __kmpc_impl_shfl_down_sync(AllLanes, val, delta, size); -} - -int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) { - uint32_t lo, hi; - __kmpc_impl_unpack(val, lo, hi); - hi = __kmpc_impl_shfl_down_sync(AllLanes, hi, delta, size); - lo = __kmpc_impl_shfl_down_sync(AllLanes, lo, delta, size); - return __kmpc_impl_pack(lo, hi); -} - -#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/support.cu +++ /dev/null @@ -1,240 +0,0 @@ -//===--------- support.cu - GPU OpenMP support functions --------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Wrapper implementation to some functions natively supported by the GPU. 
-// -//===----------------------------------------------------------------------===// -#pragma omp declare target - -#include "common/debug.h" -#include "common/omptarget.h" -#include "common/support.h" - -//////////////////////////////////////////////////////////////////////////////// -// Execution Parameters -//////////////////////////////////////////////////////////////////////////////// - -void setExecutionParameters(OMPTgtExecModeFlags EMode, - OMPTgtRuntimeModeFlags RMode) { - execution_param = EMode; - execution_param |= RMode; -} - -bool isGenericMode() { return execution_param & OMP_TGT_EXEC_MODE_GENERIC; } - -bool isRuntimeUninitialized() { return !isRuntimeInitialized(); } - -bool isRuntimeInitialized() { - return execution_param & OMP_TGT_RUNTIME_INITIALIZED; -} - -//////////////////////////////////////////////////////////////////////////////// -// support: get info from machine -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// -// Calls to the Generic Scheme Implementation Layer (assuming 1D layout) -// -//////////////////////////////////////////////////////////////////////////////// - -// The master thread id is the first thread (lane) of the last warp. -// Thread id is 0 indexed. -// E.g: If NumThreads is 33, master id is 32. -// If NumThreads is 64, master id is 32. -// If NumThreads is 97, master id is 96. -// If NumThreads is 1024, master id is 992. -// -// Called in Generic Execution Mode only. -int GetMasterThreadID() { - return (__kmpc_get_hardware_num_threads_in_block() - 1) & ~(WARPSIZE - 1); -} - -// The last warp is reserved for the master; other warps are workers. -// Called in Generic Execution Mode only. 
-int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); } - -//////////////////////////////////////////////////////////////////////////////// -// get thread id in team - -// This function may be called in a parallel region by the workers -// or a serial region by the master. If the master (whose CUDA thread -// id is GetMasterThreadID()) calls this routine, we return 0 because -// it is a shadow for the first worker. -int GetLogicalThreadIdInBlock() { - // Implemented using control flow (predication) instead of with a modulo - // operation. - int tid = __kmpc_get_hardware_thread_id_in_block(); - if (__kmpc_is_generic_main_thread(tid)) - return 0; - else - return tid; -} - -//////////////////////////////////////////////////////////////////////////////// -// -// OpenMP Thread Support Layer -// -//////////////////////////////////////////////////////////////////////////////// - -int GetOmpThreadId() { - int tid = __kmpc_get_hardware_thread_id_in_block(); - if (__kmpc_is_generic_main_thread(tid)) - return 0; - // omp_thread_num - int rc; - if (__kmpc_parallel_level() > 1) { - rc = 0; - } else if (__kmpc_is_spmd_exec_mode()) { - rc = tid; - } else { - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid); - ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); - rc = currTaskDescr->ThreadId(); - } - return rc; -} - -int GetNumberOfOmpThreads(bool isSPMDExecutionMode) { - // omp_num_threads - int rc; - int Level = parallelLevel[GetWarpId()]; - if (Level != OMP_ACTIVE_PARALLEL_LEVEL + 1) { - rc = 1; - } else if (isSPMDExecutionMode) { - rc = __kmpc_get_hardware_num_threads_in_block(); - } else { - rc = threadsInTeam; - } - - return rc; -} - -//////////////////////////////////////////////////////////////////////////////// -// Team id linked to OpenMP - -int GetOmpTeamId() { - // omp_team_num - return GetBlockIdInKernel(); // assume 1 block per team -} - -int GetNumberOfOmpTeams() { - // omp_num_teams - 
return __kmpc_get_hardware_num_blocks(); // assume 1 block per team -} - -//////////////////////////////////////////////////////////////////////////////// -// Masters - -int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); } - -//////////////////////////////////////////////////////////////////////////////// -// Parallel level - -void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) { - __kmpc_impl_syncwarp(Mask); - __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt(); - unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt); - if (Rank == 0) { - parallelLevel[GetWarpId()] += - (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); - __kmpc_impl_threadfence(); - } - __kmpc_impl_syncwarp(Mask); -} - -void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) { - __kmpc_impl_syncwarp(Mask); - __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt(); - unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt); - if (Rank == 0) { - parallelLevel[GetWarpId()] -= - (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); - __kmpc_impl_threadfence(); - } - __kmpc_impl_syncwarp(Mask); -} - -//////////////////////////////////////////////////////////////////////////////// -// get OpenMP number of procs - -// Get the number of processors in the device. 
-int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) { - if (!isSPMDExecutionMode) - return GetNumberOfWorkersInTeam(); - return __kmpc_get_hardware_num_threads_in_block(); -} - -int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) { - return GetNumberOfProcsInDevice(isSPMDExecutionMode); -} - -//////////////////////////////////////////////////////////////////////////////// -// Memory -//////////////////////////////////////////////////////////////////////////////// - -unsigned long PadBytes(unsigned long size, - unsigned long alignment) // must be a power of 2 -{ - // compute the necessary padding to satisfy alignment constraint - ASSERT(LT_FUSSY, (alignment & (alignment - 1)) == 0, - "alignment %lu is not a power of 2\n", alignment); - return (~(unsigned long)size + 1) & (alignment - 1); -} - -void *SafeMalloc(size_t size, const char *msg) // check if success -{ - void *ptr = __kmpc_impl_malloc(size); - PRINT(LD_MEM, "malloc data of size %llu for %s: 0x%llx\n", - (unsigned long long)size, msg, (unsigned long long)ptr); - return ptr; -} - -void *SafeFree(void *ptr, const char *msg) { - PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", (unsigned long long)ptr, msg); - __kmpc_impl_free(ptr); - return NULL; -} - -//////////////////////////////////////////////////////////////////////////////// -// Teams Reduction Scratchpad Helpers -//////////////////////////////////////////////////////////////////////////////// - -unsigned int *GetTeamsReductionTimestamp() { - return static_cast(ReductionScratchpadPtr); -} - -char *GetTeamsReductionScratchpad() { - return static_cast(ReductionScratchpadPtr) + 256; -} - -// Invoke an outlined parallel function unwrapping arguments (up -// to 32). 
-void __kmp_invoke_microtask(kmp_int32 global_tid, kmp_int32 bound_tid, void *fn, - void **args, size_t nargs) { - switch (nargs) { -#include "common/generated_microtask_cases.gen" - default: - printf("Too many arguments in kmp_invoke_microtask, aborting execution.\n"); - __builtin_trap(); - } -} - -namespace _OMP { -/// Helper to keep code alive without introducing a performance penalty. -__attribute__((used, retain, weak, optnone, cold)) void keepAlive() { - __kmpc_get_hardware_thread_id_in_block(); - __kmpc_get_hardware_num_threads_in_block(); - __kmpc_get_warp_size(); - __kmpc_barrier_simple_spmd(nullptr, 0); - __kmpc_barrier_simple_generic(nullptr, 0); -} -} // namespace _OMP - -#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/sync.cu b/openmp/libomptarget/deviceRTLs/common/src/sync.cu deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/sync.cu +++ /dev/null @@ -1,143 +0,0 @@ -//===------------ sync.cu - GPU OpenMP synchronizations ---------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Include all synchronization. 
-// -//===----------------------------------------------------------------------===// -#pragma omp declare target - -#include "common/omptarget.h" -#include "target_impl.h" - -//////////////////////////////////////////////////////////////////////////////// -// KMP Ordered calls -//////////////////////////////////////////////////////////////////////////////// - -EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t tid) { - PRINT0(LD_IO, "call kmpc_ordered\n"); -} - -EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t tid) { - PRINT0(LD_IO, "call kmpc_end_ordered\n"); -} - -//////////////////////////////////////////////////////////////////////////////// -// KMP Barriers -//////////////////////////////////////////////////////////////////////////////// - -// a team is a block: we can use CUDA native synchronization mechanism -// FIXME: what if not all threads (warps) participate to the barrier? -// We may need to implement it differently - -EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc_ref, int32_t tid) { - PRINT0(LD_IO, "call kmpc_cancel_barrier\n"); - __kmpc_barrier(loc_ref, tid); - PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n"); - return 0; -} - -EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) { - if (isRuntimeUninitialized()) { - ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), - "Expected SPMD mode with uninitialized runtime."); - __kmpc_barrier_simple_spmd(loc_ref, tid); - } else { - tid = GetLogicalThreadIdInBlock(); - int numberOfActiveOMPThreads = - GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode()); - if (numberOfActiveOMPThreads > 1) { - if (__kmpc_is_spmd_exec_mode()) { - __kmpc_barrier_simple_spmd(loc_ref, tid); - } else { - // The #threads parameter must be rounded up to the WARPSIZE. 
- int threads = - WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE); - - PRINT(LD_SYNC, - "call kmpc_barrier with %d omp threads, sync parameter %d\n", - (int)numberOfActiveOMPThreads, (int)threads); - __kmpc_impl_named_sync(threads); - } - } else { - // Still need to flush the memory per the standard. - __kmpc_flush(loc_ref); - } // numberOfActiveOMPThreads > 1 - PRINT0(LD_SYNC, "completed kmpc_barrier\n"); - } -} - -// Emit a simple barrier call in SPMD mode. Assumes the caller is in an L0 -// parallel region and that all worker threads participate. -EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid) { - PRINT0(LD_SYNC, "call kmpc_barrier_simple_spmd\n"); - __kmpc_impl_syncthreads(); - PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n"); -} -EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) { - return __kmpc_barrier_simple_spmd(loc_ref, tid); -} - -//////////////////////////////////////////////////////////////////////////////// -// KMP MASTER -//////////////////////////////////////////////////////////////////////////////// - -EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid) { - PRINT0(LD_IO, "call kmpc_master\n"); - return IsTeamMaster(global_tid); -} - -EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid) { - PRINT0(LD_IO, "call kmpc_end_master\n"); - ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here"); -} - -//////////////////////////////////////////////////////////////////////////////// -// KMP SINGLE -//////////////////////////////////////////////////////////////////////////////// - -EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid) { - PRINT0(LD_IO, "call kmpc_single\n"); - // decide to implement single with master; master get the single - return IsTeamMaster(global_tid); -} - -EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid) { - PRINT0(LD_IO, "call kmpc_end_single\n"); - // decide to implement single with 
master: master get the single - ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here"); - // sync barrier is explicitly called... so that is not a problem -} - -//////////////////////////////////////////////////////////////////////////////// -// Flush -//////////////////////////////////////////////////////////////////////////////// - -EXTERN void __kmpc_flush(kmp_Ident *loc) { - PRINT0(LD_IO, "call kmpc_flush\n"); - __kmpc_impl_threadfence(); -} - -//////////////////////////////////////////////////////////////////////////////// -// Vote -//////////////////////////////////////////////////////////////////////////////// - -EXTERN uint64_t __kmpc_warp_active_thread_mask(void) { - PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n"); - return __kmpc_impl_activemask(); -} - -//////////////////////////////////////////////////////////////////////////////// -// Syncwarp -//////////////////////////////////////////////////////////////////////////////// - -EXTERN void __kmpc_syncwarp(uint64_t Mask) { - PRINT0(LD_IO, "call __kmpc_syncwarp\n"); - __kmpc_impl_syncwarp(Mask); -} - -#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/task.cu b/openmp/libomptarget/deviceRTLs/common/src/task.cu deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/task.cu +++ /dev/null @@ -1,219 +0,0 @@ -//===------------- task.h - NVPTX OpenMP tasks support ----------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Task implementation support. 
-// -// explicit task structure uses -// omptarget_nvptx task -// kmp_task -// -// where kmp_task is -// - klegacy_TaskDescr <- task pointer -// shared -> X -// routine -// part_id -// descr -// - private (of size given by task_alloc call). Accessed by -// task+sizeof(klegacy_TaskDescr) -// * private data * -// - shared: X. Accessed by shared ptr in klegacy_TaskDescr -// * pointer table to shared variables * -// - end -// -//===----------------------------------------------------------------------===// -#pragma omp declare target - -#include "common/omptarget.h" - -EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc( - kmp_Ident *loc, // unused - uint32_t global_tid, // unused - int32_t flag, // unused (because in our impl, all are immediately exec - size_t sizeOfTaskInclPrivate, size_t sizeOfSharedTable, - kmp_TaskFctPtr taskSub) { - PRINT(LD_IO, - "call __kmpc_omp_task_alloc(size priv&struct %lld, shared %lld, " - "fct 0x%llx)\n", - (long long)sizeOfTaskInclPrivate, (long long)sizeOfSharedTable, - (unsigned long long)taskSub); - // want task+priv to be a multiple of 8 bytes - size_t padForTaskInclPriv = PadBytes(sizeOfTaskInclPrivate, sizeof(void *)); - sizeOfTaskInclPrivate += padForTaskInclPriv; - size_t kmpSize = sizeOfTaskInclPrivate + sizeOfSharedTable; - ASSERT(LT_FUSSY, sizeof(omptarget_nvptx_TaskDescr) % sizeof(void *) == 0, - "need task descr of size %d to be a multiple of %d\n", - (int)sizeof(omptarget_nvptx_TaskDescr), (int)sizeof(void *)); - size_t totSize = sizeof(omptarget_nvptx_TaskDescr) + kmpSize; - omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = - (omptarget_nvptx_ExplicitTaskDescr *)SafeMalloc( - totSize, "explicit task descriptor"); - kmp_TaskDescr *newKmpTaskDescr = &newExplicitTaskDescr->kmpTaskDescr; - ASSERT0(LT_FUSSY, - (uint64_t)newKmpTaskDescr == - (uint64_t)ADD_BYTES(newExplicitTaskDescr, - sizeof(omptarget_nvptx_TaskDescr)), - "bad size assumptions"); - // init kmp_TaskDescr - newKmpTaskDescr->sharedPointerTable = - (void *)((char 
*)newKmpTaskDescr + sizeOfTaskInclPrivate); - newKmpTaskDescr->sub = taskSub; - newKmpTaskDescr->destructors = NULL; - PRINT(LD_TASK, "return with task descr kmp: 0x%llx, omptarget-nvptx 0x%llx\n", - (unsigned long long)newKmpTaskDescr, - (unsigned long long)newExplicitTaskDescr); - - return newKmpTaskDescr; -} - -EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newKmpTaskDescr) { - return __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, - 0); -} - -EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newKmpTaskDescr, - int32_t depNum, void *depList, - int32_t noAliasDepNum, - void *noAliasDepList) { - PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n", - P64(newKmpTaskDescr)); - ASSERT0(LT_FUSSY, isRuntimeInitialized(), - "Runtime must be initialized."); - // 1. get explicit task descr from kmp task descr - omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = - (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( - newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); - ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, - "bad assumptions"); - omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr; - ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr, - "bad assumptions"); - - // 2. push new context: update new task descriptor - int tid = GetLogicalThreadIdInBlock(); - omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid); - newTaskDescr->CopyForExplicitTask(parentTaskDescr); - // set new task descriptor as top - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr); - - // 3. 
call sub - PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n", - (unsigned long long)newKmpTaskDescr->sub, - (unsigned long long)newKmpTaskDescr); - newKmpTaskDescr->sub(0, newKmpTaskDescr); - PRINT(LD_TASK, "return from call task sub 0x%llx()\n", - (unsigned long long)newKmpTaskDescr->sub); - - // 4. pop context - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, - parentTaskDescr); - // 5. free - SafeFree(newExplicitTaskDescr, "explicit task descriptor"); - return 0; -} - -EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newKmpTaskDescr) { - PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n", - (unsigned long long)newKmpTaskDescr); - ASSERT0(LT_FUSSY, isRuntimeInitialized(), - "Runtime must be initialized."); - // 1. get explicit task descr from kmp task descr - omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = - (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( - newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); - ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, - "bad assumptions"); - omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr; - ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr, - "bad assumptions"); - - // 2. push new context: update new task descriptor - int tid = GetLogicalThreadIdInBlock(); - omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid); - newTaskDescr->CopyForExplicitTask(parentTaskDescr); - // set new task descriptor as top - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr); - // 3... noting to call... is inline - // 4 & 5 ... 
done in complete -} - -EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newKmpTaskDescr) { - PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n", - (unsigned long long)newKmpTaskDescr); - ASSERT0(LT_FUSSY, isRuntimeInitialized(), - "Runtime must be initialized."); - // 1. get explicit task descr from kmp task descr - omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = - (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( - newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); - ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, - "bad assumptions"); - omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr; - ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr, - "bad assumptions"); - // 2. get parent - omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr(); - // 3... noting to call... is inline - // 4. pop context - int tid = GetLogicalThreadIdInBlock(); - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, - parentTaskDescr); - // 5. 
free - SafeFree(newExplicitTaskDescr, "explicit task descriptor"); -} - -EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid, - int32_t depNum, void *depList, - int32_t noAliasDepNum, void *noAliasDepList) { - PRINT0(LD_IO, "call to __kmpc_omp_wait_deps(..)\n"); - // nothing to do as all our tasks are executed as final -} - -EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid) { - PRINT0(LD_IO, "call to __kmpc_taskgroup(..)\n"); - // nothing to do as all our tasks are executed as final -} - -EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid) { - PRINT0(LD_IO, "call to __kmpc_end_taskgroup(..)\n"); - // nothing to do as all our tasks are executed as final -} - -EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid, - int end_part) { - PRINT0(LD_IO, "call to __kmpc_taskyield()\n"); - // do nothing: tasks are executed immediately, no yielding allowed - return 0; -} - -EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid) { - PRINT0(LD_IO, "call to __kmpc_taskwait()\n"); - // nothing to do as all our tasks are executed as final - return 0; -} - -EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newKmpTaskDescr, int if_val, - uint64_t *lb, uint64_t *ub, int64_t st, int nogroup, - int32_t sched, uint64_t grainsize, void *task_dup) { - - // skip task entirely if empty iteration space - if (*lb > *ub) - return; - - // the compiler has already stored lb and ub in the kmp_TaskDescr structure - // as we are using a single task to execute the entire loop, we can leave - // the initial task_t untouched - - __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, 0); -} - -#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/state-queue.h b/openmp/libomptarget/deviceRTLs/common/state-queue.h deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/state-queue.h +++ /dev/null @@ -1,51 +0,0 @@ -//===--------- 
statequeue.h - NVPTX OpenMP GPU State Queue ------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains a queue to hand out OpenMP state objects to teams of -// one or more kernels. -// -// Reference: -// Thomas R.W. Scogland and Wu-chun Feng. 2015. -// Design and Evaluation of Scalable Concurrent Queues for Many-Core -// Architectures. International Conference on Performance Engineering. -// -//===----------------------------------------------------------------------===// - -#ifndef __STATE_QUEUE_H -#define __STATE_QUEUE_H - -#include - -#include "target_impl.h" - -template class omptarget_nvptx_Queue { -private: - ElementType elements[SIZE]; - volatile ElementType *elementQueue[SIZE]; - volatile uint32_t head; - volatile uint32_t ids[SIZE]; - volatile uint32_t tail; - - static const uint32_t MAX_ID = (1u << 31) / SIZE / 2; - INLINE uint32_t ENQUEUE_TICKET(); - INLINE uint32_t DEQUEUE_TICKET(); - INLINE static uint32_t ID(uint32_t ticket); - INLINE bool IsServing(uint32_t slot, uint32_t id); - INLINE void PushElement(uint32_t slot, ElementType *element); - INLINE ElementType *PopElement(uint32_t slot); - INLINE void DoneServing(uint32_t slot, uint32_t id); - -public: - INLINE omptarget_nvptx_Queue() {} - INLINE void Enqueue(ElementType *element); - INLINE ElementType *Dequeue(); -}; - -#include "state-queuei.h" - -#endif diff --git a/openmp/libomptarget/deviceRTLs/common/state-queuei.h b/openmp/libomptarget/deviceRTLs/common/state-queuei.h deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/state-queuei.h +++ /dev/null @@ -1,88 +0,0 @@ -//===------- state-queuei.h - OpenMP GPU State Queue ------------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache 
License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of a queue to hand out OpenMP state -// objects to teams of one or more kernels. -// -// Reference: -// Thomas R.W. Scogland and Wu-chun Feng. 2015. -// Design and Evaluation of Scalable Concurrent Queues for Many-Core -// Architectures. International Conference on Performance Engineering. -// -//===----------------------------------------------------------------------===// - -#include "state-queue.h" - -template -INLINE uint32_t omptarget_nvptx_Queue::ENQUEUE_TICKET() { - return __kmpc_atomic_add((unsigned int *)&tail, 1u); -} - -template -INLINE uint32_t omptarget_nvptx_Queue::DEQUEUE_TICKET() { - return __kmpc_atomic_add((unsigned int *)&head, 1u); -} - -template -INLINE uint32_t omptarget_nvptx_Queue::ID(uint32_t ticket) { - return (ticket / SIZE) * 2; -} - -template -INLINE bool omptarget_nvptx_Queue::IsServing(uint32_t slot, - uint32_t id) { - return __kmpc_atomic_add((unsigned int *)&ids[slot], 0u) == id; -} - -template -INLINE void -omptarget_nvptx_Queue::PushElement(uint32_t slot, - ElementType *element) { - __kmpc_atomic_exchange((unsigned long long *)&elementQueue[slot], - (unsigned long long)element); -} - -template -INLINE ElementType * -omptarget_nvptx_Queue::PopElement(uint32_t slot) { - return (ElementType *)__kmpc_atomic_add( - (unsigned long long *)&elementQueue[slot], (unsigned long long)0); -} - -template -INLINE void omptarget_nvptx_Queue::DoneServing(uint32_t slot, - uint32_t id) { - __kmpc_atomic_exchange((unsigned int *)&ids[slot], (id + 1) % MAX_ID); -} - -template -INLINE void -omptarget_nvptx_Queue::Enqueue(ElementType *element) { - uint32_t ticket = ENQUEUE_TICKET(); - uint32_t slot = ticket % SIZE; - uint32_t id = ID(ticket) + 1; - while 
(!IsServing(slot, id)) - ; - PushElement(slot, element); - DoneServing(slot, id); -} - -template -INLINE ElementType *omptarget_nvptx_Queue::Dequeue() { - uint32_t ticket = DEQUEUE_TICKET(); - uint32_t slot = ticket % SIZE; - uint32_t id = ID(ticket); - while (!IsServing(slot, id)) - ; - ElementType *element = PopElement(slot); - // This is to populate the queue because of the lack of GPU constructors. - if (element == 0) - element = &elements[slot]; - DoneServing(slot, id); - return element; -} diff --git a/openmp/libomptarget/deviceRTLs/common/support.h b/openmp/libomptarget/deviceRTLs/common/support.h deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/common/support.h +++ /dev/null @@ -1,91 +0,0 @@ -//===--------- support.h - OpenMP GPU support functions ---------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Wrapper to some functions natively supported by the GPU. 
-// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_SUPPORT_H -#define OMPTARGET_SUPPORT_H - -#include "interface.h" -#include "target_impl.h" - -//////////////////////////////////////////////////////////////////////////////// -// Execution Parameters -//////////////////////////////////////////////////////////////////////////////// -enum OMPTgtExecModeFlags : int8_t { - OMP_TGT_EXEC_MODE_GENERIC = 1 << 0, - OMP_TGT_EXEC_MODE_SPMD = 1 << 1 -}; - -enum OMPTgtRuntimeModeFlags : int8_t { - OMP_TGT_RUNTIME_UNINITIALIZED = 0, - OMP_TGT_RUNTIME_INITIALIZED = 1 << 2 -}; - -void setExecutionParameters(OMPTgtExecModeFlags EMode, - OMPTgtRuntimeModeFlags RMode); -bool isGenericMode(); -bool isRuntimeUninitialized(); -bool isRuntimeInitialized(); - -//////////////////////////////////////////////////////////////////////////////// -// get info from machine -//////////////////////////////////////////////////////////////////////////////// - -// get global ids to locate tread/team info (constant regardless of OMP) -int GetLogicalThreadIdInBlock(); -int GetMasterThreadID(); -int GetNumberOfWorkersInTeam(); - -// get OpenMP thread and team ids -int GetOmpThreadId(); // omp_thread_num -int GetOmpTeamId(); // omp_team_num - -// get OpenMP number of threads and team -int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads -int GetNumberOfOmpTeams(); // omp_num_teams - -// get OpenMP number of procs -int GetNumberOfProcsInTeam(bool isSPMDExecutionMode); -int GetNumberOfProcsInDevice(bool isSPMDExecutionMode); - -// masters -int IsTeamMaster(int ompThreadId); - -// Parallel level -void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); -void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); - -//////////////////////////////////////////////////////////////////////////////// -// Memory -//////////////////////////////////////////////////////////////////////////////// - -// safe alloc and 
free -void *SafeMalloc(size_t size, const char *msg); // check if success -void *SafeFree(void *ptr, const char *msg); -// pad to a alignment (power of 2 only) -unsigned long PadBytes(unsigned long size, unsigned long alignment); -#define ADD_BYTES(_addr, _bytes) \ - ((void *)((char *)((void *)(_addr)) + (_bytes))) -#define SUB_BYTES(_addr, _bytes) \ - ((void *)((char *)((void *)(_addr)) - (_bytes))) - -//////////////////////////////////////////////////////////////////////////////// -// Teams Reduction Scratchpad Helpers -//////////////////////////////////////////////////////////////////////////////// -unsigned int *GetTeamsReductionTimestamp(); -char *GetTeamsReductionScratchpad(); - -// Invoke an outlined parallel function unwrapping global, shared arguments (up -// to 128). -void __kmp_invoke_microtask(kmp_int32 global_tid, kmp_int32 bound_tid, void *fn, - void **args, size_t nargs); - -#endif diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/interface.h +++ /dev/null @@ -1,505 +0,0 @@ -//===------- interface.h - OpenMP interface definitions ---------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains all the definitions that are relevant to -// the interface. The first section contains the interface as -// declared by OpenMP. The second section includes the compiler -// specific interfaces. 
-// -//===----------------------------------------------------------------------===// - -#ifndef _INTERFACES_H_ -#define _INTERFACES_H_ - -#include -#include - -#ifdef __AMDGCN__ -#include "amdgcn/src/amdgcn_interface.h" -#endif -#ifdef __CUDACC__ -#include "nvptx/src/nvptx_interface.h" -#endif - -//////////////////////////////////////////////////////////////////////////////// -// OpenMP interface -//////////////////////////////////////////////////////////////////////////////// - -typedef uint64_t omp_nest_lock_t; /* arbitrary type of the right length */ - -typedef enum omp_sched_t { - omp_sched_static = 1, /* chunkSize >0 */ - omp_sched_dynamic = 2, /* chunkSize >0 */ - omp_sched_guided = 3, /* chunkSize >0 */ - omp_sched_auto = 4, /* no chunkSize */ -} omp_sched_t; - -typedef enum omp_proc_bind_t { - omp_proc_bind_false = 0, - omp_proc_bind_true = 1, - omp_proc_bind_master = 2, - omp_proc_bind_close = 3, - omp_proc_bind_spread = 4 -} omp_proc_bind_t; - -EXTERN double omp_get_wtick(void); -EXTERN double omp_get_wtime(void); - -EXTERN void omp_set_num_threads(int num); -EXTERN int omp_get_num_threads(void); -EXTERN int omp_get_max_threads(void); -EXTERN int omp_get_thread_limit(void); -EXTERN int omp_get_thread_num(void); -EXTERN int omp_get_num_procs(void); -EXTERN int omp_in_parallel(void); -EXTERN int omp_in_final(void); -EXTERN void omp_set_dynamic(int flag); -EXTERN int omp_get_dynamic(void); -EXTERN void omp_set_nested(int flag); -EXTERN int omp_get_nested(void); -EXTERN void omp_set_max_active_levels(int level); -EXTERN int omp_get_max_active_levels(void); -EXTERN int omp_get_level(void); -EXTERN int omp_get_active_level(void); -EXTERN int omp_get_ancestor_thread_num(int level); -EXTERN int omp_get_team_size(int level); - -EXTERN void omp_init_lock(omp_lock_t *lock); -EXTERN void omp_init_nest_lock(omp_nest_lock_t *lock); -EXTERN void omp_destroy_lock(omp_lock_t *lock); -EXTERN void omp_destroy_nest_lock(omp_nest_lock_t *lock); -EXTERN void 
omp_set_lock(omp_lock_t *lock); -EXTERN void omp_set_nest_lock(omp_nest_lock_t *lock); -EXTERN void omp_unset_lock(omp_lock_t *lock); -EXTERN void omp_unset_nest_lock(omp_nest_lock_t *lock); -EXTERN int omp_test_lock(omp_lock_t *lock); -EXTERN int omp_test_nest_lock(omp_nest_lock_t *lock); - -EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier); -EXTERN void omp_set_schedule(omp_sched_t kind, int modifier); -EXTERN omp_proc_bind_t omp_get_proc_bind(void); -EXTERN int omp_get_cancellation(void); -EXTERN void omp_set_default_device(int deviceId); -EXTERN int omp_get_default_device(void); -EXTERN int omp_get_num_devices(void); -EXTERN int omp_get_num_teams(void); -EXTERN int omp_get_team_num(void); -EXTERN int omp_get_initial_device(void); -EXTERN int omp_get_max_task_priority(void); - -EXTERN void *llvm_omp_get_dynamic_shared(); - -//////////////////////////////////////////////////////////////////////////////// -// file below is swiped from kmpc host interface -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// kmp specific types -//////////////////////////////////////////////////////////////////////////////// - -typedef enum kmp_sched_t { - kmp_sched_static_chunk = 33, - kmp_sched_static_nochunk = 34, - kmp_sched_dynamic = 35, - kmp_sched_guided = 36, - kmp_sched_runtime = 37, - kmp_sched_auto = 38, - - kmp_sched_static_balanced_chunk = 45, - - kmp_sched_static_ordered = 65, - kmp_sched_static_nochunk_ordered = 66, - kmp_sched_dynamic_ordered = 67, - kmp_sched_guided_ordered = 68, - kmp_sched_runtime_ordered = 69, - kmp_sched_auto_ordered = 70, - - kmp_sched_distr_static_chunk = 91, - kmp_sched_distr_static_nochunk = 92, - kmp_sched_distr_static_chunk_sched_static_chunkone = 93, - - kmp_sched_default = kmp_sched_static_nochunk, - kmp_sched_unordered_first = kmp_sched_static_chunk, - kmp_sched_unordered_last = kmp_sched_auto, - 
kmp_sched_ordered_first = kmp_sched_static_ordered, - kmp_sched_ordered_last = kmp_sched_auto_ordered, - kmp_sched_distribute_first = kmp_sched_distr_static_chunk, - kmp_sched_distribute_last = - kmp_sched_distr_static_chunk_sched_static_chunkone, - - /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers. - * Since we need to distinguish the three possible cases (no modifier, - * monotonic modifier, nonmonotonic modifier), we need separate bits for - * each modifier. The absence of monotonic does not imply nonmonotonic, - * especially since 4.5 says that the behaviour of the "no modifier" case - * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0. - * - * Since we're passing a full 32 bit value, we can use a couple of high - * bits for these flags; out of paranoia we avoid the sign bit. - * - * These modifiers can be or-ed into non-static schedules by the compiler - * to pass the additional information. They will be stripped early in the - * processing in __kmp_dispatch_init when setting up schedules, so - * most of the code won't ever see schedules with these bits set. - */ - kmp_sched_modifier_monotonic = (1 << 29), - /**< Set if the monotonic schedule modifier was present */ - kmp_sched_modifier_nonmonotonic = (1 << 30), -/**< Set if the nonmonotonic schedule modifier was present */ - -#define SCHEDULE_WITHOUT_MODIFIERS(s) \ - (enum kmp_sched_t)( \ - (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) -#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sched_modifier_monotonic) != 0) -#define SCHEDULE_HAS_NONMONOTONIC(s) \ - (((s)&kmp_sched_modifier_nonmonotonic) != 0) -#define SCHEDULE_HAS_NO_MODIFIERS(s) \ - (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \ - 0) - -} kmp_sched_t; - -/*! - * Enum for accesseing the reserved_2 field of the ident_t struct below. - */ -enum { - /*! Bit set to 1 when in SPMD mode. */ - KMP_IDENT_SPMD_MODE = 0x01, - /*! 
Bit set to 1 when a simplified runtime is used. */ - KMP_IDENT_SIMPLE_RT_MODE = 0x02, -}; - -/*! - * The ident structure that describes a source location. - * The struct is identical to the one in the kmp.h file. - * We maintain the same data structure for compatibility. - */ -typedef short kmp_int16; -typedef int kmp_int32; -typedef struct ident { - kmp_int32 reserved_1; /**< might be used in Fortran; see above */ - kmp_int32 flags; /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC - identifies this union member */ - kmp_int32 reserved_2; /**< not really used in Fortran any more; see above */ - kmp_int32 reserved_3; /**< source[4] in Fortran, do not use for C++ */ - char const *psource; /**< String describing the source location. - The string is composed of semi-colon separated fields - which describe the source file, the function and a pair - of line numbers that delimit the construct. */ -} ident_t; - -// parallel defs -typedef ident_t kmp_Ident; -typedef void (*kmp_InterWarpCopyFctPtr)(void *src, int32_t warp_num); -typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, - int16_t lane_offset, - int16_t shortCircuit); -typedef void (*kmp_ListGlobalFctPtr)(void *buffer, int idx, void *reduce_data); - -// task defs -typedef struct kmp_TaskDescr kmp_TaskDescr; -typedef int32_t (*kmp_TaskFctPtr)(int32_t global_tid, kmp_TaskDescr *taskDescr); -typedef struct kmp_TaskDescr { - void *sharedPointerTable; // ptr to a table of shared var ptrs - kmp_TaskFctPtr sub; // task subroutine - int32_t partId; // unused - kmp_TaskFctPtr destructors; // destructor of c++ first private -} kmp_TaskDescr; - -// sync defs -typedef int32_t kmp_CriticalName[8]; - -//////////////////////////////////////////////////////////////////////////////// -// external interface -//////////////////////////////////////////////////////////////////////////////// - -// parallel -EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc); -NOINLINE EXTERN uint8_t __kmpc_parallel_level(); 
- -// proc bind -EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t global_tid, - int proc_bind); -EXTERN int omp_get_num_places(void); -EXTERN int omp_get_place_num_procs(int place_num); -EXTERN void omp_get_place_proc_ids(int place_num, int *ids); -EXTERN int omp_get_place_num(void); -EXTERN int omp_get_partition_num_places(void); -EXTERN void omp_get_partition_place_nums(int *place_nums); - -// for static (no chunk or chunk) -EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk); -EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk); -EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk); -EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter1, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk); -// distribute static (no chunk or chunk) -EXTERN void __kmpc_distribute_static_init_4(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk); -EXTERN void __kmpc_distribute_static_init_4u(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk); -EXTERN void __kmpc_distribute_static_init_8(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk); -EXTERN void __kmpc_distribute_static_init_8u(kmp_Ident *loc, int32_t global_tid, - int32_t 
sched, int32_t *plastiter1, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk); -EXTERN -void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk); -EXTERN -void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk); -EXTERN -void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk); -EXTERN -void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter1, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk); -EXTERN -void __kmpc_for_static_init_4_simple_generic(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk); -EXTERN -void __kmpc_for_static_init_4u_simple_generic( - kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, - int32_t chunk); -EXTERN -void __kmpc_for_static_init_8_simple_generic(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk); -EXTERN -void __kmpc_for_static_init_8u_simple_generic( - kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter1, - uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, - int64_t chunk); - -EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid); - -EXTERN void __kmpc_distribute_static_fini(kmp_Ident *loc, int32_t global_tid); - 
-// for dynamic -EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t lower, int32_t upper, - int32_t incr, int32_t chunk); -EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t global_tid, - int32_t sched, uint32_t lower, - uint32_t upper, int32_t incr, - int32_t chunk); -EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int64_t lower, int64_t upper, - int64_t incr, int64_t chunk); -EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t global_tid, - int32_t sched, uint64_t lower, - uint64_t upper, int64_t incr, - int64_t chunk); - -EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t global_tid, - int32_t *plastiter, int32_t *plower, - int32_t *pupper, int32_t *pstride); -EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t global_tid, - int32_t *plastiter, uint32_t *plower, - uint32_t *pupper, int32_t *pstride); -EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t global_tid, - int32_t *plastiter, int64_t *plower, - int64_t *pupper, int64_t *pstride); -EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t global_tid, - int32_t *plastiter, uint64_t *plower, - uint64_t *pupper, int64_t *pstride); - -EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t global_tid); - -// reduction -EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid); -EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid); -EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( - kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, - void *reduce_data, kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct); -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( - kmp_Ident *loc, int32_t global_tid, void *global_buffer, - 
int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct, - kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct, - kmp_ListGlobalFctPtr glredFct); -EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size); -EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size); - -// sync barrier -EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid); -EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid); -EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid); -EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc, int32_t global_tid); - -// single -EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid); - -// sync -EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_critical(kmp_Ident *loc, int32_t global_tid, - kmp_CriticalName *crit); -EXTERN void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid, - kmp_CriticalName *crit); -EXTERN void __kmpc_flush(kmp_Ident *loc); - -// vote -EXTERN uint64_t __kmpc_warp_active_thread_mask(void); -// syncwarp -EXTERN void __kmpc_syncwarp(uint64_t); - -// tasks -EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Ident *loc, uint32_t global_tid, - int32_t flag, - size_t sizeOfTaskInclPrivate, - size_t sizeOfSharedTable, - kmp_TaskFctPtr sub); -EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newLegacyTaskDescr); -EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newLegacyTaskDescr, - int32_t depNum, void *depList, - int32_t noAliasDepNum, - void *noAliasDepList); -EXTERN void 
__kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newLegacyTaskDescr); -EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newLegacyTaskDescr); -EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid, - int32_t depNum, void *depList, - int32_t noAliasDepNum, void *noAliasDepList); -EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid); -EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid); -EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid, - int end_part); -EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid); -EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newKmpTaskDescr, int if_val, - uint64_t *lb, uint64_t *ub, int64_t st, int nogroup, - int32_t sched, uint64_t grainsize, void *task_dup); - -// cancel -EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid, - int32_t cancelVal); -EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid, - int32_t cancelVal); - -// non standard -EXTERN int32_t __kmpc_target_init(ident_t *Ident, int8_t Mode, - bool UseGenericStateMachine, - bool RequiresFullRuntime); -EXTERN void __kmpc_target_deinit(ident_t *Ident, int8_t Mode, - bool RequiresFullRuntime); -EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, - int32_t NumThreadsClause); -EXTERN bool __kmpc_kernel_parallel(void **WorkFn); -EXTERN void __kmpc_kernel_end_parallel(); - -EXTERN void __kmpc_data_sharing_init_stack(); -EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs); -EXTERN void __kmpc_end_sharing_variables(); -EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs); - -/// Entry point to start a new parallel region. -/// -/// \param ident The source identifier. -/// \param global_tid The global thread ID. -/// \param if_expr The if(expr), or 1 if none given. 
-/// \param num_threads The num_threads(expr), or -1 if none given. -/// \param proc_bind The proc_bind, or `proc_bind_default` if none given. -/// \param fn The outlined parallel region function. -/// \param wrapper_fn The worker wrapper function of fn. -/// \param args The pointer array of arguments to fn. -/// \param nargs The number of arguments to fn. -NOINLINE EXTERN void __kmpc_parallel_51(ident_t *ident, kmp_int32 global_tid, - kmp_int32 if_expr, - kmp_int32 num_threads, int proc_bind, - void *fn, void *wrapper_fn, void **args, - size_t nargs); - -// SPMD execution mode interrogation function. -EXTERN int8_t __kmpc_is_spmd_exec_mode(); - -/// Return true if the hardware thread id \p Tid represents the OpenMP main -/// thread in generic mode outside of a parallel region. -EXTERN int8_t __kmpc_is_generic_main_thread(kmp_int32 Tid); - -/// Return true if the hardware thread id \p Tid represents the OpenMP main -/// thread in generic mode. -EXTERN int8_t __kmpc_is_generic_main_thread_id(kmp_int32 Tid); - -EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, - const void *buf, size_t size, - int16_t is_shared, const void **res); - -EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, - int16_t is_shared); - -/// Allocate \p Bytes in "shareable" memory and return the address. Needs to be -/// called balanced with __kmpc_free_shared like a stack (push/pop). Can be -/// called by any thread, allocation happens per-thread. -EXTERN void *__kmpc_alloc_shared(uint64_t Bytes); - -/// Deallocate \p Ptr. Needs to be called balanced with __kmpc_alloc_shared like -/// a stack (push/pop). Can be called by any thread. \p Ptr must be allocated by -/// __kmpc_alloc_shared by the same thread. \p Bytes contains the size of the -/// paired allocation to make memory management easier. -EXTERN void __kmpc_free_shared(void *Ptr, size_t Bytes); - -/// Get a pointer to the dynamic shared memory buffer in the device. 
-EXTERN void *__kmpc_get_dynamic_shared(); - -#endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ /dev/null @@ -1,257 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build the NVPTX (CUDA) Device RTL if the CUDA tools are available -# -##===----------------------------------------------------------------------===## - -# By default we will build NVPTX deviceRTL on a CUDA free system -set(LIBOMPTARGET_BUILD_NVPTX_BCLIB FALSE CACHE BOOL - "Whether build NVPTX deviceRTL on CUDA free system.") - -if (NOT LIBOMPTARGET_BUILD_NVPTX_BCLIB) - libomptarget_say("Not building NVPTX deviceRTL: Disabled by LIBOMPTARGET_BUILD_NVPTX_BCLIB") - return() -endif() - -if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS) - libomptarget_say("Not building NVPTX device RTL: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS") - return() -endif() - -# Check if we can create an LLVM bitcode implementation of the runtime library -# that could be inlined in the user application. For that we need to find -# a Clang compiler capable of compiling our CUDA files to LLVM bitcode and -# an LLVM linker. 
-set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING - "Location of a CUDA compiler capable of emitting LLVM bitcode.") -set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING - "Location of a linker capable of linking LLVM bitcode objects.") - -if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "") - set(cuda_compiler ${LIBOMPTARGET_NVPTX_CUDA_COMPILER}) -elseif (LLVM_TOOL_CLANG_BUILD AND NOT CMAKE_CROSSCOMPILING) - # Compile the deviceRTL with the clang that is built in the project. - set(cuda_compiler "$") -elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") - # Compile the device runtime with the compiler that OpenMP is built with. - # This is the case with LLVM_ENABLE_RUNTIMES=openmp. - # FIXME: This is unreliable; the compiler can be on older version of clang - # that does not support compiling CUDA, or only an older version of it. The - # risk is especially high on sytems where clang is the default compiler - # (MacOS, BSDs). LLVM_ENABLE_RUNTIMES=openmp should itself set - # LIBOMPTARGET_NVPTX_CUDA_COMPILER instead. - set(cuda_compiler ${CMAKE_C_COMPILER}) -else() - libomptarget_say("Not building NVPTX deviceRTL: clang not found") - return() -endif() - -# Get compiler directory to try to locate a suitable linker. -get_filename_component(compiler_dir ${cuda_compiler} DIRECTORY) -set(llvm_link "${compiler_dir}/llvm-link") - -if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "") - set(bc_linker ${LIBOMPTARGET_NVPTX_BC_LINKER}) -elseif (EXISTS ${llvm_link}) - # Try to use the linker consistent with the CUDA compiler unless explicitly - # set to a different linker. - set(bc_linker ${llvm_link}) -elseif (NOT OPENMP_STANDALONE_BUILD AND NOT CMAKE_CROSSCOMPILING) - # Use the linker also built in the same project. - set(bc_linker "$") -else() - libomptarget_say("Not building NVPTX deviceRTL: llvm-link not found") - return() -endif() - -# TODO: This part needs to be refined when libomptarget is going to support -# Windows! 
-# TODO: This part can also be removed if we can change the clang driver to make -# it support device only compilation. -if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") - set(aux_triple x86_64-unknown-linux-gnu) -elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le") - set(aux_triple powerpc64le-unknown-linux-gnu) -elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") - set(aux_triple aarch64-unknown-linux-gnu) -else() - libomptarget_say("Not building CUDA offloading device RTL: unknown host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}") - return() -endif() - -get_filename_component(devicertl_base_directory - ${CMAKE_CURRENT_SOURCE_DIR} - DIRECTORY) -set(devicertl_common_directory - ${devicertl_base_directory}/common) -set(devicertl_nvptx_directory - ${devicertl_base_directory}/nvptx) - -set(all_capabilities 35 37 50 52 53 60 61 62 70 72 75 80 86) - -set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${all_capabilities} CACHE STRING - "List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.") -string(TOLOWER ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES} LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES) - -if (LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "all") - set(nvptx_sm_list ${all_capabilities}) -elseif(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "auto") - if (NOT LIBOMPTARGET_DEP_CUDA_FOUND) - libomptarget_error_say("[NVPTX] Cannot auto detect compute capability as CUDA not found.") - endif() - set(nvptx_sm_list ${LIBOMPTARGET_DEP_CUDA_ARCH}) -else() - string(REPLACE "," ";" nvptx_sm_list "${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES}") -endif() - -# If user set LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES to empty, we disable the -# build. -if (NOT nvptx_sm_list) - libomptarget_say("Not building CUDA offloading device RTL: empty compute capability list") - return() -endif() - -# Check all SM values -foreach(sm ${nvptx_sm_list}) - if (NOT ${sm} IN_LIST all_capabilities) - libomptarget_warning_say("[NVPTX] Compute capability ${sm} is not supported. 
Make sure clang can work with it.") - endif() -endforeach() - -# Override default MAX_SM in src/target_impl.h if requested -if (DEFINED LIBOMPTARGET_NVPTX_MAX_SM) - set(MAX_SM_DEFINITION "-DMAX_SM=${LIBOMPTARGET_NVPTX_MAX_SM}") -endif() - -# Activate RTL message dumps if requested by the user. -set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL - "Activate NVPTX device RTL debug messages.") - -if ("${cuda_compiler}" STREQUAL "$") - libomptarget_say("Building CUDA LLVM bitcode offloading device RTL using in-tree clang.") -else () - libomptarget_say("Building CUDA LLVM bitcode offloading device RTL using ${cuda_compiler}") -endif () - -set(cuda_src_files - ${devicertl_common_directory}/src/cancel.cu - ${devicertl_common_directory}/src/critical.cu - ${devicertl_common_directory}/src/data_sharing.cu - ${devicertl_common_directory}/src/libcall.cu - ${devicertl_common_directory}/src/loop.cu - ${devicertl_common_directory}/src/omp_data.cu - ${devicertl_common_directory}/src/omptarget.cu - ${devicertl_common_directory}/src/parallel.cu - ${devicertl_common_directory}/src/reduction.cu - ${devicertl_common_directory}/src/support.cu - ${devicertl_common_directory}/src/sync.cu - ${devicertl_common_directory}/src/task.cu - ${devicertl_common_directory}/src/shuffle.cpp - src/target_impl.cu -) - -# Prepend -I to each list element -set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}") -list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX PREPEND "-I") - -# Set flags for LLVM Bitcode compilation. 
-set(bc_flags -S -x c++ -O1 -std=c++14 - -mllvm -openmp-opt-disable - -ffreestanding - -target nvptx64 - -fvisibility=hidden - -Xclang -emit-llvm-bc - -Xclang -aux-triple -Xclang ${aux_triple} - -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device - -Xclang -target-feature -Xclang +ptx61 - -D__CUDACC__ - -I${devicertl_base_directory} - -I${devicertl_common_directory}/include - -I${devicertl_nvptx_directory}/src - -I${devicertl_base_directory}/../include - ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX}) - -if(${LIBOMPTARGET_NVPTX_DEBUG}) - list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=-1 -g) -else() - list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=0) -endif() - -# Create target to build all Bitcode libraries. -add_custom_target(omptarget-nvptx-bc) - -# Generate a Bitcode library for all the compute capabilities the user requested -foreach(sm ${nvptx_sm_list}) - set(cuda_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0") - set(bc_files "") - foreach(src ${cuda_src_files}) - get_filename_component(infile ${src} ABSOLUTE) - get_filename_component(outfile ${src} NAME) - set(outfile "${outfile}-sm_${sm}.bc") - - add_custom_command(OUTPUT ${outfile} - COMMAND ${cuda_compiler} ${bc_flags} - ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile} - DEPENDS ${infile} - IMPLICIT_DEPENDS CXX ${infile} - COMMENT "Building LLVM bitcode ${outfile}" - VERBATIM - ) - if("${cuda_compiler}" STREQUAL "$") - # Add a file-level dependency to ensure that clang is up-to-date. - # By default, add_custom_command only builds clang if the - # executable is missing. - add_custom_command(OUTPUT ${outfile} - DEPENDS clang - APPEND - ) - endif() - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}) - - list(APPEND bc_files ${outfile}) - endforeach() - - set(bclib_name "libomptarget-nvptx-sm_${sm}.bc") - - # Link to a bitcode library. 
- add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} - COMMAND ${bc_linker} - -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files} - DEPENDS ${bc_files} - COMMENT "Linking LLVM bitcode ${bclib_name}" - ) - if("${bc_linker}" STREQUAL "$") - # Add a file-level dependency to ensure that llvm-link is up-to-date. - # By default, add_custom_command only builds llvm-link if the - # executable is missing. - add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} - DEPENDS llvm-link - APPEND - ) - endif() - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name}) - - set(bclib_target_name "omptarget-nvptx-sm_${sm}-bc") - - add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}) - add_dependencies(omptarget-nvptx-bc ${bclib_target_name}) - - # Copy library to destination. - add_custom_command(TARGET ${bclib_target_name} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} - ${LIBOMPTARGET_LIBRARY_DIR}) - - # Install bitcode library under the lib destination folder. - install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}") -endforeach() - -# Test will be enabled if the building machine supports CUDA -if (LIBOMPTARGET_DEP_CUDA_FOUND) - add_subdirectory(test) -endif() diff --git a/openmp/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt b/openmp/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt +++ /dev/null @@ -1,523 +0,0 @@ - -**Design document for OpenMP reductions on the GPU** - -//Abstract: //In this document we summarize the new design for an OpenMP -implementation of reductions on NVIDIA GPUs. 
This document comprises -* a succinct background review, -* an introduction to the decoupling of reduction algorithm and - data-structure-specific processing routines, -* detailed illustrations of reduction algorithms used and -* a brief overview of steps we have made beyond the last implementation. - -**Problem Review** - -Consider a typical OpenMP program with reduction pragma. - -``` - double foo, bar; - #pragma omp parallel for reduction(+:foo, bar) - for (int i = 0; i < N; i++) { - foo+=A[i]; bar+=B[i]; - } -``` -where 'foo' and 'bar' are reduced across all threads in the parallel region. -Our primary goal is to efficiently aggregate the values of foo and bar in -such manner that -* makes the compiler logically concise. -* efficiently reduces within warps, threads, blocks and the device. - -**Introduction to Decoupling** -In this section we address the problem of making the compiler -//logically concise// by partitioning the task of reduction into two broad -categories: data-structure specific routines and algorithmic routines. - -The previous reduction implementation was highly coupled with -the specificity of the reduction element data structures (e.g., sizes, data -types) and operators of the reduction (e.g., addition, multiplication). In -our implementation we strive to decouple them. In our final implementations, -we could remove all template functions in our runtime system. - -The (simplified) pseudo code generated by LLVM is as follows: - -``` - 1. Create private copies of variables: foo_p, bar_p - 2. Each thread reduces the chunk of A and B assigned to it and writes - to foo_p and bar_p respectively. - 3. 
ret = kmpc_nvptx_reduce_nowait(..., reduceData, shuffleReduceFn, - interWarpCpyFn) - where: - struct ReduceData { - double *foo; - double *bar; - } reduceData - reduceData.foo = &foo_p - reduceData.bar = &bar_p - - shuffleReduceFn and interWarpCpyFn are two auxiliary functions - generated to aid the runtime performing algorithmic steps - while being data-structure agnostic about ReduceData. - - In particular, shuffleReduceFn is a function that takes the following - inputs: - a. local copy of ReduceData - b. its lane_id - c. the offset of the lane_id which hosts a remote ReduceData - relative to the current one - d. an algorithm version parameter determining which reduction - algorithm to use. - This shuffleReduceFn retrieves the remote ReduceData through shuffle - intrinsics and reduces, using the algorithm specified by the 4th - parameter, the local ReduceData and with the remote ReduceData element - wise, and places the resultant values into the local ReduceData. - - Different reduction algorithms are implemented with different runtime - functions, but they all make calls to this same shuffleReduceFn to - perform the essential reduction step. Therefore, based on the 4th - parameter, this shuffleReduceFn will behave slightly differently to - cooperate with the runtime function to ensure correctness under - different circumstances. - - InterWarpCpyFn, as the name suggests, is a function that copies data - across warps. Its function is to tunnel all the thread private - ReduceData that is already reduced within a warp to a lane in the first - warp with minimal shared memory footprint. This is an essential step to - prepare for the last step of a block reduction. - - (Warp, block, device level reduction routines that utilize these - auxiliary functions will be discussed in the next section.) - - 4. if ret == 1: - The master thread stores the reduced result in the globals. 
- foo += reduceData.foo; bar += reduceData.bar -``` - -**Reduction Algorithms** - -On the warp level, we have three versions of the algorithms: - -1. Full Warp Reduction - -``` -gpu_regular_warp_reduce(void *reduce_data, - kmp_ShuffleReductFctPtr ShuffleReduceFn) { - for (int offset = WARPSIZE/2; offset > 0; offset /= 2) - ShuffleReduceFn(reduce_data, 0, offset, 0); -} -``` -ShuffleReduceFn is used here with lane_id set to 0 because it is not used -therefore we save instructions by not retrieving lane_id from the corresponding -special registers. The 4th parameters, which represents the version of the -algorithm being used here, is set to 0 to signify full warp reduction. - -In this version specified (=0), the ShuffleReduceFn behaves, per element, as -follows: - -``` -//reduce_elem refers to an element in the local ReduceData -//remote_elem is retrieved from a remote lane -remote_elem = shuffle_down(reduce_elem, offset, 32); -reduce_elem = reduce_elem @ remote_elem; - -``` - -An illustration of this algorithm operating on a hypothetical 8-lane full-warp -would be: -{F74} -The coloring invariant follows that elements with the same color will be -combined and reduced in the next reduction step. As can be observed, no overhead -is present, exactly log(2, N) steps are needed. - -2. 
Contiguous Full Warp Reduction -``` -gpu_irregular_warp_reduce(void *reduce_data, - kmp_ShuffleReductFctPtr ShuffleReduceFn, int size, - int lane_id) { - int curr_size; - int offset; - curr_size = size; - mask = curr_size/2; - while (offset>0) { - ShuffleReduceFn(reduce_data, lane_id, offset, 1); - curr_size = (curr_size+1)/2; - offset = curr_size/2; - } -} -``` - -In this version specified (=1), the ShuffleReduceFn behaves, per element, as -follows: -``` -//reduce_elem refers to an element in the local ReduceData -//remote_elem is retrieved from a remote lane -remote_elem = shuffle_down(reduce_elem, offset, 32); -if (lane_id < offset) { - reduce_elem = reduce_elem @ remote_elem -} else { - reduce_elem = remote_elem -} -``` - -An important invariant (also a restriction on the starting state of the -reduction) is that this algorithm assumes that all unused ReduceData are -located in a contiguous subset of threads in a warp starting from lane 0. - -With the presence of a trailing active lane with an odd-numbered lane -id, its value will not be aggregated with any other lane. Therefore, -in order to preserve the invariant, such ReduceData is copied to the first lane -whose thread-local ReduceData has already being used in a previous reduction -and would therefore be useless otherwise. - -An illustration of this algorithm operating on a hypothetical 8-lane partial -warp woud be: -{F75} - -As illustrated, this version of the algorithm introduces overhead whenever -we have odd number of participating lanes in any reduction step to -copy data between lanes. - -3. Dispersed Partial Warp Reduction -``` -gpu_irregular_simt_reduce(void *reduce_data, - kmp_ShuffleReductFctPtr ShuffleReduceFn) { - int size, remote_id; - int logical_lane_id = find_number_of_dispersed_active_lanes_before_me() * 2; - do { - remote_id = find_the_next_active_lane_id_right_after_me(); - // the above function returns 0 of no active lane - // is present right after the current thread. 
- size = get_number_of_active_lanes_in_this_warp(); - logical_lane_id /= 2; - ShuffleReduceFn(reduce_data, logical_lane_id, remote_id-1-threadIdx.x, 2); - } while (logical_lane_id % 2 == 0 && size > 1); -``` - -There is no assumption made about the initial state of the reduction. -Any number of lanes (>=1) could be active at any position. The reduction -result is kept in the first active lane. - -In this version specified (=2), the ShuffleReduceFn behaves, per element, as -follows: -``` -//reduce_elem refers to an element in the local ReduceData -//remote_elem is retrieved from a remote lane -remote_elem = shuffle_down(reduce_elem, offset, 32); -if (LaneId % 2 == 0 && Offset > 0) { - reduce_elem = reduce_elem @ remote_elem -} else { - reduce_elem = remote_elem -} -``` -We will proceed with a brief explanation for some arguments passed in, -it is important to notice that, in this section, we will introduce the -concept of logical_lane_id, and it is important to distinguish it -from physical lane_id as defined by nvidia. -1. //logical_lane_id//: as the name suggests, it refers to the calculated - lane_id (instead of the physical one defined by nvidia) that would make - our algorithm logically concise. A thread with logical_lane_id k means - there are (k-1) threads before it. -2. //remote_id-1-threadIdx.x//: remote_id is indeed the nvidia-defined lane - id of the remote lane from which we will retrieve the ReduceData. We - subtract (threadIdx+1) from it because we would like to maintain only one - underlying shuffle intrinsic (which is used to communicate among lanes in a - warp). This particular version of shuffle intrinsic we take accepts only - offsets, instead of absolute lane_id. Therefore the subtraction is performed - on the absolute lane_id we calculated to obtain the offset. - -This algorithm is slightly different in 2 ways and it is not, conceptually, a -generalization of the above algorithms. -1. It reduces elements close to each other. 
For instance, values in the 0th lane - is to be combined with that of the 1st lane; values in the 2nd lane is to be - combined with that of the 3rd lane. We did not use the previous algorithm - where the first half of the (partial) warp is reduced with the second half - of the (partial) warp. This is because, the mapping - f(x): logical_lane_id -> physical_lane_id; - can be easily calculated whereas its inverse - f^-1(x): physical_lane_id -> logical_lane_id - cannot and performing such reduction requires the inverse to be known. -2. Because this algorithm is agnostic about the positions of the lanes that are - active, we do not need to perform the coping step as in the second - algorithm. -An illustrative run would look like -{F76} -As observed, overhead is high because in each and every step of reduction, -logical_lane_id is recalculated; so is the remote_id. - -On a block level, we have implemented the following block reduce algorithm: - -``` -gpu_irregular_block_reduce(void *reduce_data, - kmp_ShuffleReductFctPtr shuflReduceFn, - kmp_InterWarpCopyFctPtr interWarpCpyFn, - int size) { - - int wid = threadIdx.x/WARPSIZE; - int lane_id = threadIdx.x%WARPSIZE; - - int warp_needed = (size+WARPSIZE-1)/WARPSIZE; //ceiling of division - - unsigned tnum = __ballot(1); - int thread_num = __popc(tnum); - - //full warp reduction - if (thread_num == WARPSIZE) { - gpu_regular_warp_reduce(reduce_data, shuflReduceFn); - } - //partial warp reduction - if (thread_num < WARPSIZE) { - gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, thread_num, - lane_id); - } - //Gather all the reduced values from each warp - //to the first warp - //named_barrier inside this function to ensure - //correctness. It is effectively a sync_thread - //that won't deadlock. - interWarpCpyFn(reduce_data, warp_needed); - - //This is to reduce data gathered from each "warp master". 
- if (wid==0) { - gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, warp_needed, - lane_id); - } - - return; -} -``` -In this function, no ShuffleReduceFn is directly called as it makes calls -to various versions of the warp-reduction functions. It first reduces -ReduceData warp by warp; in the end, we end up with the number of -ReduceData equal to the number of warps present in this thread -block. We then proceed to gather all such ReduceData to the first warp. - -As observed, in this algorithm we make use of the function InterWarpCpyFn, -which copies data from each of the "warp master" (0th lane of each warp, where -a warp-reduced ReduceData is held) to the 0th warp. This step reduces (in a -mathematical sense) the problem of reduction across warp masters in a block to -the problem of warp reduction which we already have solutions to. - -We can thus completely avoid the use of atomics to reduce in a threadblock. - -**Efficient Cross Block Reduce** - -The next challenge is to reduce values across threadblocks. We aim to do this -without atomics or critical sections. - -Let a kernel be started with TB threadblocks. -Let the GPU have S SMs. -There can be at most N active threadblocks per SM at any time. - -Consider a threadblock tb (tb < TB) running on SM s (s < SM). 'tb' is one of -at most 'N' active threadblocks on SM s. Let each threadblock active on an SM -be given an instance identifier id (0 <= id < N). Therefore, the tuple (s, id) -uniquely identifies an active threadblock on the GPU. - -To efficiently implement cross block reduce, we first allocate an array for -each value to be reduced of size S*N (which is the maximum number of active -threadblocks at any time on the device). - -Each threadblock reduces its value to slot [s][id]. This can be done without -locking since no other threadblock can write to the same slot concurrently. 
- -As a final stage, we reduce the values in the array as follows: - -``` -// Compiler generated wrapper function for each target region with a reduction -clause. -target_function_wrapper(map_args, reduction_array) <--- start with 1 team and 1 - thread. - // Use dynamic parallelism to launch M teams, N threads as requested by the - user to execute the target region. - - target_function<>(map_args) - - Reduce values in reduction_array - -``` - -**Comparison with Last Version** - - -The (simplified) pseudo code generated by LLVM on the host is as follows: - - -``` - 1. Create private copies of variables: foo_p, bar_p - 2. Each thread reduces the chunk of A and B assigned to it and writes - to foo_p and bar_p respectively. - 3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock) - where: - struct ReduceData { - double *foo; - double *bar; - } reduceData - reduceData.foo = &foo_p - reduceData.bar = &bar_p - - reduceFn is a pointer to a function that takes in two inputs - of type ReduceData, "reduces" them element wise, and places the - result in the first input: - reduceFn(ReduceData *a, ReduceData *b) - a = a @ b - - Every thread in the parallel region calls kmpc_reduce_nowait with - its private copy of reduceData. The runtime reduces across the - threads (using tree reduction on the operator 'reduceFn?) and stores - the final result in the master thread if successful. - 4. if ret == 1: - The master thread stores the reduced result in the globals. - foo += reduceData.foo; bar += reduceData.bar - 5. else if ret == 2: - In this case kmpc_reduce_nowait() could not use tree reduction, - so use atomics instead: - each thread atomically writes to foo - each thread atomically writes to bar -``` - -On a GPU, a similar reduction may need to be performed across SIMT threads, -warps, and threadblocks. The challenge is to do so efficiently in a fashion -that is compatible with the LLVM OpenMP implementation. 
- -In the previously released 0.1 version of the LLVM OpenMP compiler for GPUs, -the salient steps of the code generated are as follows: - - -``` - 1. Create private copies of variables: foo_p, bar_p - 2. Each thread reduces the chunk of A and B assigned to it and writes - to foo_p and bar_p respectively. - 3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock) - status = can_block_reduce() - if status == 1: - reduce efficiently to thread 0 using shuffles and shared memory. - return 1 - else - cannot use efficient block reduction, fallback to atomics - return 2 - 4. if ret == 1: - The master thread stores the reduced result in the globals. - foo += reduceData.foo; bar += reduceData.bar - 5. else if ret == 2: - In this case kmpc_reduce_nowait() could not use tree reduction, - so use atomics instead: - each thread atomically writes to foo - each thread atomically writes to bar -``` - -The function can_block_reduce() is defined as follows: - - -``` -int32_t can_block_reduce() { - int tid = GetThreadIdInTeam(); - int nt = GetNumberOfOmpThreads(tid); - if (nt != blockDim.x) - return 0; - unsigned tnum = __ballot(1); - if (tnum != (~0x0)) { - return 0; - } - return 1; -} -``` - -This function permits the use of the efficient block reduction algorithm -using shuffles and shared memory (return 1) only if (a) all SIMT threads in -a warp are active (i.e., number of threads in the parallel region is a -multiple of 32) and (b) the number of threads in the parallel region -(set by the num_threads clause) equals blockDim.x. - -If either of these preconditions is not true, each thread in the threadblock -updates the global value using atomics. - -Atomics and compare-and-swap operations are expensive on many threaded -architectures such as GPUs and we must avoid them completely. - - -**Appendix: Implementation Details** - - -``` -// Compiler generated function. 
-reduceFn(ReduceData *a, ReduceData *b) - a->foo = a->foo + b->foo - a->bar = a->bar + b->bar - -// Compiler generated function. -swapAndReduceFn(ReduceData *thread_private, int lane) - ReduceData *remote = new ReduceData() - remote->foo = shuffle_double(thread_private->foo, lane) - remote->bar = shuffle_double(thread_private->bar, lane) - reduceFn(thread_private, remote) - -// OMP runtime function. -warpReduce_regular(ReduceData *thread_private, Fn *swapAndReduceFn): - offset = 16 - while (offset > 0) - swapAndReduceFn(thread_private, offset) - offset /= 2 - -// OMP runtime function. -warpReduce_irregular(): - ... - -// OMP runtime function. -kmpc_reduce_warp(reduceData, swapAndReduceFn) - if all_lanes_active: - warpReduce_regular(reduceData, swapAndReduceFn) - else: - warpReduce_irregular(reduceData, swapAndReduceFn) - if in_simd_region: - // all done, reduce to global in simd lane 0 - return 1 - else if in_parallel_region: - // done reducing to one value per warp, now reduce across warps - return 3 - -// OMP runtime function; one for each basic type. -kmpc_reduce_block_double(double *a) - if lane == 0: - shared[wid] = *a - named_barrier(1, num_threads) - if wid == 0 - block_reduce(shared) - if lane == 0 - *a = shared[0] - named_barrier(1, num_threads) - if wid == 0 and lane == 0 - return 1 // write back reduced result - else - return 0 // don't do anything - -``` - - - -``` -// Compiler generated code. - 1. Create private copies of variables: foo_p, bar_p - 2. Each thread reduces the chunk of A and B assigned to it and writes - to foo_p and bar_p respectively. - 3. ret = kmpc_reduce_warp(reduceData, swapAndReduceFn) - 4. if ret == 1: - The master thread stores the reduced result in the globals. - foo += reduceData.foo; bar += reduceData.bar - 5. else if ret == 3: - ret = block_reduce_double(reduceData.foo) - if ret == 1: - foo += reduceData.foo - ret = block_reduce_double(reduceData.bar) - if ret == 1: - bar += reduceData.bar -``` - -**Notes** - - 1. 
This scheme requires that the CUDA OMP runtime can call llvm generated - functions. This functionality now works. - 2. If the user inlines the CUDA OMP runtime bitcode, all of the machinery - (including calls through function pointers) are optimized away. - 3. If we are reducing multiple to multiple variables in a parallel region, - the reduce operations are all performed in warpReduce_[ir]regular(). This - results in more instructions in the loop and should result in fewer - stalls due to data dependencies. Unfortunately we cannot do the same in - kmpc_reduce_block_double() without increasing shared memory usage. diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h +++ /dev/null @@ -1,17 +0,0 @@ -//===--- nvptx_interface.h - OpenMP interface definitions -------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _NVPTX_INTERFACE_H_ -#define _NVPTX_INTERFACE_H_ - -#include - -#define EXTERN extern "C" -typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ - -#endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ /dev/null @@ -1,89 +0,0 @@ -//===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Definitions of target specific functions -// -//===----------------------------------------------------------------------===// -#ifndef _TARGET_IMPL_H_ -#define _TARGET_IMPL_H_ - -#include "nvptx_interface.h" - -#include -#include - -// subset of inttypes.h -#define PRId64 "ld" -#define PRIu64 "lu" - -typedef uint32_t __kmpc_impl_lanemask_t; - -#define INLINE inline __attribute__((always_inline)) -#define NOINLINE __attribute__((noinline)) -#define ALIGN(N) __attribute__((aligned(N))) -#define PLUGIN_ACCESSIBLE /* no annotation needed for cuda plugin */ - -#include "llvm/Frontend/OpenMP/OMPGridValues.h" - -INLINE constexpr const llvm::omp::GV &getGridValue() { - return llvm::omp::NVPTXGridValues; -} - -//////////////////////////////////////////////////////////////////////////////// -// Kernel options -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// The following def must match the absolute limit hardwired in the host RTL -// max number of threads per team -enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size }; -enum { WARPSIZE = getGridValue().GV_Warp_Size }; - -// Maximum number of omp state objects per SM allocated statically in global -// memory. 
-#if __CUDA_ARCH__ >= 600 -#define OMP_STATE_COUNT 32 -#else -#define OMP_STATE_COUNT 16 -#endif - -#if !defined(MAX_SM) -#if __CUDA_ARCH__ >= 900 -#error unsupported compute capability, define MAX_SM via LIBOMPTARGET_NVPTX_MAX_SM cmake option -#elif __CUDA_ARCH__ >= 800 -// GA100 design has a maxinum of 128 SMs but A100 product only has 108 SMs -// GA102 design has a maxinum of 84 SMs -#define MAX_SM 108 -#elif __CUDA_ARCH__ >= 700 -#define MAX_SM 84 -#elif __CUDA_ARCH__ >= 600 -#define MAX_SM 56 -#else -#define MAX_SM 16 -#endif -#endif - -#define OMP_ACTIVE_PARALLEL_LEVEL 128 - -// Data sharing related quantities, need to match what is used in the compiler. -enum DATA_SHARING_SIZES { - // The size reserved for data in a shared memory slot. - DS_Slot_Size = getGridValue().GV_Slot_Size, - // The slot size that should be reserved for a working warp. - DS_Worker_Warp_Slot_Size = getGridValue().warpSlotSize(), - // The maximum number of warps in use - DS_Max_Warp_Number = getGridValue().maxWarpNumber(), -}; - -enum : __kmpc_impl_lanemask_t { - __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0 -}; - -#define printf(...) - -#endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ /dev/null @@ -1,198 +0,0 @@ -//===---------- target_impl.cu - NVPTX OpenMP GPU options ------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Definitions of target specific functions -// -//===----------------------------------------------------------------------===// -#pragma omp declare target - -#include "common/debug.h" -#include "target_impl.h" -#include "target_interface.h" - -EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { - asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); -} - -EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) { - uint64_t val; - asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi)); - return val; -} - -EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { - __kmpc_impl_lanemask_t res; - asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res)); - return res; -} - -EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { - __kmpc_impl_lanemask_t res; - asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res)); - return res; -} - -EXTERN uint32_t __kmpc_impl_smid() { - uint32_t id; - asm("mov.u32 %0, %%smid;" : "=r"(id)); - return id; -} - -EXTERN double __kmpc_impl_get_wtick() { - // Timer precision is 1ns - return ((double)1E-9); -} - -EXTERN double __kmpc_impl_get_wtime() { - unsigned long long nsecs; - asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs)); - return (double)nsecs * __kmpc_impl_get_wtick(); -} - -EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() { - unsigned int Mask; - asm volatile("activemask.b32 %0;" : "=r"(Mask)); - return Mask; -} - -EXTERN void __kmpc_impl_syncthreads() { - int barrier = 2; - asm volatile("barrier.sync %0;" - : - : "r"(barrier) - : "memory"); -} - -EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) { - __nvvm_bar_warp_sync(Mask); -} - -// NVPTX specific kernel initialization -EXTERN void __kmpc_impl_target_init() { /* nvptx needs no extra setup */ -} - -// Barrier until num_threads arrive. 
-EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) { - // The named barrier for active parallel threads of a team in an L1 parallel - // region to synchronize with each other. - int barrier = 1; - asm volatile("barrier.sync %0, %1;" - : - : "r"(barrier), "r"(num_threads) - : "memory"); -} - -EXTERN void __kmpc_impl_threadfence() { __nvvm_membar_gl(); } -EXTERN void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); } -EXTERN void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); } - -// Calls to the NVPTX layer (assuming 1D layout) -EXTERN int __kmpc_get_hardware_thread_id_in_block() { - return __nvvm_read_ptx_sreg_tid_x(); -} -EXTERN int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); } -EXTERN int __kmpc_get_hardware_num_blocks() { - return __nvvm_read_ptx_sreg_nctaid_x(); -} -EXTERN int __kmpc_get_hardware_num_threads_in_block() { - return __nvvm_read_ptx_sreg_ntid_x(); -} -EXTERN unsigned __kmpc_get_warp_size() { return WARPSIZE; } -EXTERN unsigned GetWarpId() { - return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE; -} -EXTERN unsigned GetLaneId() { - return __kmpc_get_hardware_thread_id_in_block() & (WARPSIZE - 1); -} - -// Atomics -uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) { - return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST); -} -uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) { - return __nvvm_atom_inc_gen_ui(Address, Val); -} - -uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) { - return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST); -} - -uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) { - uint32_t R; - __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST); - return R; -} - -uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, uint32_t Val) { - (void)__atomic_compare_exchange(Address, &Compare, &Val, false, - __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); - return Compare; -} - -unsigned long long __kmpc_atomic_exchange(unsigned long long 
*Address, - unsigned long long Val) { - unsigned long long R; - __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST); - return R; -} - -unsigned long long __kmpc_atomic_add(unsigned long long *Address, - unsigned long long Val) { - return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST); -} - -#define __OMP_SPIN 1000 -#define UNSET 0u -#define SET 1u - -EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock) { - __kmpc_impl_unset_lock(lock); -} - -EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock) { - __kmpc_impl_unset_lock(lock); -} - -EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock) { - // TODO: not sure spinning is a good idea here.. - while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) { - int32_t start = __nvvm_read_ptx_sreg_clock(); - int32_t now; - for (;;) { - now = __nvvm_read_ptx_sreg_clock(); - int32_t cycles = now > start ? now - start : now + (0xffffffff - start); - if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) { - break; - } - } - } // wait for 0 to be the read value -} - -EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock) { - (void)__kmpc_atomic_exchange(lock, UNSET); -} - -EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock) { - return __kmpc_atomic_add(lock, 0u); -} - -extern "C" { -void *malloc(size_t); -void free(void *); -int32_t vprintf(const char *, void *); -} - -EXTERN void *__kmpc_impl_malloc(size_t x) { return malloc(x); } -EXTERN void __kmpc_impl_free(void *x) { free(x); } - -EXTERN int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, - uint32_t) { - return vprintf(Format, Arguments); -} - -#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -if(NOT OPENMP_TEST_COMPILER_ID STREQUAL "Clang") - # Silently return, no need to annoy the user. 
- return() -endif() - -set(deps omptarget omp) -if(LIBOMPTARGET_NVPTX_ENABLE_BCLIB) - set(deps ${deps} omptarget-nvptx-bc) -endif() - -# Run with only one thread to only launch one application to the GPU at a time. -add_openmp_testsuite(check-libomptarget-nvptx - "Running libomptarget-nvptx tests" ${CMAKE_CURRENT_BINARY_DIR} - EXCLUDE_FROM_CHECK_ALL - DEPENDS ${deps} ARGS -j1) - -set(LIBOMPTARGET_NVPTX_TEST_FLAGS "" CACHE STRING - "Extra compiler flags to send to the test compiler.") -set(LIBOMPTARGET_NVPTX_TEST_OPENMP_FLAGS - "-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda" CACHE STRING - "OpenMP compiler flags to use for testing libomptarget-nvptx.") - -# Configure the lit.site.cfg.in file -set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget-nvptx configuration.\n# Do not edit!") -configure_file(lit.site.cfg.in lit.site.cfg @ONLY) diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/api/get_max_threads.c b/openmp/libomptarget/deviceRTLs/nvptx/test/api/get_max_threads.c deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/api/get_max_threads.c +++ /dev/null @@ -1,22 +0,0 @@ -// RUN: %compile-run-and-check -#include -#include - -int main(){ - int max_threads = -1; - int num_threads = -1; - - #pragma omp target map(tofrom: max_threads) - max_threads = omp_get_max_threads(); - - #pragma omp target parallel map(tofrom: num_threads) - { - #pragma omp master - num_threads = omp_get_num_threads(); - } - - // CHECK: Max Threads: 128, Num Threads: 128 - printf("Max Threads: %d, Num Threads: %d\n", max_threads, num_threads); - - return 0; -} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/api/ignored.c b/openmp/libomptarget/deviceRTLs/nvptx/test/api/ignored.c deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/api/ignored.c +++ /dev/null @@ -1,38 +0,0 @@ -// RUN: %compile-run-and-check - -#include -#include - -const int MaxThreads = 1024; - -int main(int argc, char *argv[]) { - int cancellation = -1, dynamic = -1, nested = 
-1, maxActiveLevels = -1; - - #pragma omp target map(cancellation, dynamic, nested, maxActiveLevels) - { - // libomptarget-nvptx doesn't support cancellation. - cancellation = omp_get_cancellation(); - - // No support for dynamic adjustment of the number of threads. - omp_set_dynamic(1); - dynamic = omp_get_dynamic(); - - // libomptarget-nvptx doesn't support nested parallelism. - omp_set_nested(1); - nested = omp_get_nested(); - - omp_set_max_active_levels(42); - maxActiveLevels = omp_get_max_active_levels(); - } - - // CHECK: cancellation = 0 - printf("cancellation = %d\n", cancellation); - // CHECK: dynamic = 0 - printf("dynamic = %d\n", dynamic); - // CHECK: nested = 0 - printf("nested = %d\n", nested); - // CHECK: maxActiveLevels = 1 - printf("maxActiveLevels = %d\n", maxActiveLevels); - - return 0; -} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/api/max_threads.c b/openmp/libomptarget/deviceRTLs/nvptx/test/api/max_threads.c deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/api/max_threads.c +++ /dev/null @@ -1,53 +0,0 @@ -// RUN: %compile-run-and-check - -#include -#include - -int main(int argc, char *argv[]) { - int MaxThreadsL1 = -1, MaxThreadsL2 = -1; - -#pragma omp declare reduction(unique:int \ - : omp_out = (omp_in == 1 ? omp_in : omp_out)) \ - initializer(omp_priv = -1) - - // Non-SPMD mode. -#pragma omp target teams map(MaxThreadsL1, MaxThreadsL2) thread_limit(32) \ - num_teams(1) - { - MaxThreadsL1 = omp_get_max_threads(); -#pragma omp parallel reduction(unique : MaxThreadsL2) - { MaxThreadsL2 = omp_get_max_threads(); } - } - - //FIXME: This Non-SPMD kernel will have 32 active threads due to - // thread_limit. However, Non-SPMD MaxThreadsL1 is the total number of - // threads in block (64 in this case), which translates to worker - // threads + WARP_SIZE for Non-SPMD kernels and worker threads for SPMD - // kernels. 
According to the spec, omp_get_max_threads must return the - // max active threads possible between the two kernel types. - - // CHECK: Non-SPMD MaxThreadsL1 = 64 - printf("Non-SPMD MaxThreadsL1 = %d\n", MaxThreadsL1); - // CHECK: Non-SPMD MaxThreadsL2 = 1 - printf("Non-SPMD MaxThreadsL2 = %d\n", MaxThreadsL2); - - // SPMD mode with full runtime - MaxThreadsL2 = -1; -#pragma omp target parallel reduction(unique : MaxThreadsL2) - { MaxThreadsL2 = omp_get_max_threads(); } - - // CHECK: SPMD with full runtime MaxThreadsL2 = 1 - printf("SPMD with full runtime MaxThreadsL2 = %d\n", MaxThreadsL2); - - // SPMD mode without runtime - MaxThreadsL2 = -1; -#pragma omp target parallel for reduction(unique : MaxThreadsL2) - for (int I = 0; I < 2; ++I) { - MaxThreadsL2 = omp_get_max_threads(); - } - - // CHECK: SPMD without runtime MaxThreadsL2 = 1 - printf("SPMD without runtime MaxThreadsL2 = %d\n", MaxThreadsL2); - - return 0; -} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/api/thread_limit.c b/openmp/libomptarget/deviceRTLs/nvptx/test/api/thread_limit.c deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/api/thread_limit.c +++ /dev/null @@ -1,72 +0,0 @@ -// RUN: %compile-run-and-check - -#include -#include - -int main(int argc, char *argv[]) { - int ThreadLimitL0 = -1, ThreadLimitL1 = -1, ThreadLimitL2 = -1; - -#pragma omp declare reduction(unique64:int \ - : omp_out = (omp_in == 64 ? omp_in : omp_out)) \ - initializer(omp_priv = -1) -#pragma omp declare reduction(unique32:int \ - : omp_out = (omp_in == 32 ? omp_in : omp_out)) \ - initializer(omp_priv = -1) - - // Non-SPMD mode. 
-#pragma omp target teams map(ThreadLimitL0, ThreadLimitL1, ThreadLimitL2) \ - thread_limit(64) num_teams(1) - { - ThreadLimitL0 = omp_get_thread_limit(); -#pragma omp parallel reduction(unique64 \ - : ThreadLimitL1, ThreadLimitL2) num_threads(32) - { - ThreadLimitL1 = omp_get_thread_limit(); -#pragma omp parallel reduction(unique64 : ThreadLimitL2) - { ThreadLimitL2 = omp_get_thread_limit(); } - } - } - - // CHECK: Non-SPMD ThreadLimitL0 = 64 - printf("Non-SPMD ThreadLimitL0 = %d\n", ThreadLimitL0); - // CHECK: Non-SPMD ThreadLimitL1 = 64 - printf("Non-SPMD ThreadLimitL1 = %d\n", ThreadLimitL1); - // CHECK: Non-SPMD ThreadLimitL2 = 64 - printf("Non-SPMD ThreadLimitL2 = %d\n", ThreadLimitL2); - - // SPMD mode with full runtime - ThreadLimitL1 = -1; - ThreadLimitL2 = -1; -#pragma omp target parallel reduction(unique32 \ - : ThreadLimitL1, ThreadLimitL2) \ - num_threads(32) - { - ThreadLimitL1 = omp_get_thread_limit(); -#pragma omp parallel reduction(unique32 : ThreadLimitL2) - { ThreadLimitL2 = omp_get_thread_limit(); } - } - - // CHECK: SPMD with full runtime ThreadLimitL1 = 32 - printf("SPMD with full runtime ThreadLimitL1 = %d\n", ThreadLimitL1); - // CHECK: SPMD with full runtime ThreadLimitL2 = 32 - printf("SPMD with full runtime ThreadLimitL2 = %d\n", ThreadLimitL2); - - // SPMD mode without runtime - ThreadLimitL1 = -1; - ThreadLimitL2 = -1; -#pragma omp target parallel for reduction(unique32 \ - : ThreadLimitL1, ThreadLimitL2) \ - num_threads(32) - for (int I = 0; I < 2; ++I) { - ThreadLimitL1 = omp_get_thread_limit(); -#pragma omp parallel reduction(unique32 : ThreadLimitL2) - { ThreadLimitL2 = omp_get_thread_limit(); } - } - - // CHECK: SPMD without runtime ThreadLimitL1 = 32 - printf("SPMD without runtime ThreadLimitL1 = %d\n", ThreadLimitL1); - // CHECK: SPMD without runtime ThreadLimitL2 = 32 - printf("SPMD without runtime ThreadLimitL2 = %d\n", ThreadLimitL2); - - return 0; -} diff --git 
a/openmp/libomptarget/deviceRTLs/nvptx/test/data_sharing/alignment.c b/openmp/libomptarget/deviceRTLs/nvptx/test/data_sharing/alignment.c deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/data_sharing/alignment.c +++ /dev/null @@ -1,55 +0,0 @@ -// RUN: %compile-run-and-check - -#include -#include - -#pragma omp declare target -static void putValueInParallel(int *ptr, int value) { - #pragma omp parallel - { - *ptr = value; - } -} - -static int getId() { - int id; - putValueInParallel(&id, omp_get_thread_num()); - return id; -} -#pragma omp end declare target - -const int MaxThreads = 1024; -const int Threads = 64; - -int main(int argc, char *argv[]) { - int master; - int check[MaxThreads]; - for (int i = 0; i < MaxThreads; i++) { - check[i] = 0; - } - - #pragma omp target map(master, check[:]) - { - master = getId(); - - #pragma omp parallel num_threads(Threads) - { - check[omp_get_thread_num()] = getId(); - } - } - - // CHECK: master = 0. - printf("master = %d.\n", master); - // CHECK-NOT: invalid - for (int i = 0; i < MaxThreads; i++) { - if (i < Threads) { - if (check[i] != i) { - printf("invalid: check[%d] should be %d, is %d\n", i, i, check[i]); - } - } else if (check[i] != 0) { - printf("invalid: check[%d] should be 0, is %d\n", i, check[i]); - } - } - - return 0; -} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/lit.cfg b/openmp/libomptarget/deviceRTLs/nvptx/test/lit.cfg deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/lit.cfg +++ /dev/null @@ -1,76 +0,0 @@ -# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79: -# Configuration file for the 'lit' test runner. - -import os -import lit.formats - -# Tell pylint that we know config and lit_config exist somewhere. 
-if 'PYLINT_IMPORT' in os.environ: - config = object() - lit_config = object() - -def prepend_library_path(name, value, sep): - if name in config.environment: - config.environment[name] = value + sep + config.environment[name] - else: - config.environment[name] = value - -# name: The name of this test suite. -config.name = 'libomptarget-nvptx' - -# suffixes: A list of file extensions to treat as test files. -config.suffixes = ['.c', '.cpp', '.cc'] - -# test_source_root: The root path where tests are located. -config.test_source_root = os.path.dirname(__file__) - -# test_exec_root: The root object directory where output is placed -config.test_exec_root = config.binary_dir - -# test format -config.test_format = lit.formats.ShTest() - -# compiler flags -config.test_flags = " -I " + config.omp_header_directory + \ - " -L " + config.library_dir - -if config.omp_host_rtl_directory: - config.test_flags = config.test_flags + \ - " -L " + config.omp_host_rtl_directory - -config.test_flags = config.test_flags + " " + config.test_extra_flags - -# Setup environment to find dynamic library at runtime. -prepend_library_path('LIBRARY_PATH', config.library_dir, ":") -prepend_library_path('LD_LIBRARY_PATH', config.library_dir, ":") -prepend_library_path('LD_LIBRARY_PATH', config.omp_host_rtl_directory, ":") -if config.cuda_libdir: - prepend_library_path('LD_LIBRARY_PATH', config.cuda_libdir, ":") - -# Forbid fallback to host. 
-config.environment["OMP_TARGET_OFFLOAD"] = "MANDATORY" - -# substitutions -config.substitutions.append(("%compilexx-run-and-check", - "%compilexx-and-run | " + config.libomptarget_filecheck + " %s")) -config.substitutions.append(("%compile-run-and-check", - "%compile-and-run | " + config.libomptarget_filecheck + " %s")) -config.substitutions.append(("%compilexx-and-run", "%compilexx && %run")) -config.substitutions.append(("%compile-and-run", "%compile && %run")) - -config.substitutions.append(("%compilexx", - "%clangxx %openmp_flags %cuda_flags %flags %s -o %t")) -config.substitutions.append(("%compile", - "%clang %openmp_flags %cuda_flags %flags %s -o %t")) - -config.substitutions.append(("%clangxx", config.test_cxx_compiler)) -config.substitutions.append(("%clang", config.test_c_compiler)) -config.substitutions.append(("%openmp_flags", config.test_openmp_flags)) -if config.cuda_path: - config.substitutions.append(("%cuda_flags", "--cuda-path=" + config.cuda_path)) -else: - config.substitutions.append(("%cuda_flags", "")) -config.substitutions.append(("%flags", config.test_flags)) - -config.substitutions.append(("%run", "%t")) -config.substitutions.append(("%not", config.libomptarget_not)) diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in b/openmp/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in +++ /dev/null @@ -1,17 +0,0 @@ -@AUTO_GEN_COMMENT@ - -config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@" -config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@" -config.test_openmp_flags = "@LIBOMPTARGET_NVPTX_TEST_OPENMP_FLAGS@" -config.test_extra_flags = "@LIBOMPTARGET_NVPTX_TEST_FLAGS@" -config.cuda_path = "@CUDA_TOOLKIT_ROOT_DIR@" -config.cuda_libdir = "@CUDA_LIBDIR@" -config.binary_dir = "@CMAKE_CURRENT_BINARY_DIR@" -config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@" -config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@" 
-config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@" -config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@" -config.libomptarget_not = "@OPENMP_NOT_EXECUTABLE@" - -# Let the main config do the real work. -lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg") diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/barrier.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/barrier.c deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/barrier.c +++ /dev/null @@ -1,37 +0,0 @@ -// RUN: %compile-run-and-check - -#include -#include - -int main(int argc, char *argv[]) { - int data, out, flag = 0; -#pragma omp target teams num_teams(2) map(tofrom \ - : out) map(to \ - : data, flag) \ - thread_limit(1) -#pragma omp parallel num_threads(1) - { - if (omp_get_team_num() == 0) { - /* Write to the data buffer that will be read by thread in team 1 */ - data = 42; -/* Flush data to thread in team 1 */ -#pragma omp barrier - /* Set flag to release thread in team 1 */ -#pragma omp atomic write - flag = 1; - } else if (omp_get_team_num() == 1) { - /* Loop until we see the update to the flag */ - int val; - do { -#pragma omp atomic read - val = flag; - } while (val < 1); - out = data; -#pragma omp barrier - } - } - // CHECK: out=42. 
- /* Value of out will be 42 */ - printf("out=%d.\n", out); - return !(out == 42); -} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/flush.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/flush.c deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/flush.c +++ /dev/null @@ -1,35 +0,0 @@ -// RUN: %compile-run-and-check - -#include -#include - -int main(int argc, char *argv[]) { - int data, out, flag = 0; -#pragma omp target parallel num_threads(64) map(tofrom \ - : out, flag) map(to \ - : data) - { - if (omp_get_thread_num() == 0) { - /* Write to the data buffer that will be read by thread */ - data = 42; -/* Flush data to thread 32 */ -#pragma omp flush(data) - /* Set flag to release thread 32 */ -#pragma omp atomic write - flag = 1; - } else if (omp_get_thread_num() == 32) { - /* Loop until we see the update to the flag */ - int val; - do { -#pragma omp atomic read - val = flag; - } while (val < 1); - out = data; -#pragma omp flush(out) - } - } - // CHECK: out=42. - /* Value of out will be 42 */ - printf("out=%d.\n", out); - return !(out == 42); -} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c +++ /dev/null @@ -1,151 +0,0 @@ -// RUN: %compile-run-and-check - -#include -#include - -const int MaxThreads = 1024; -const int NumThreads = 64; - -int main(int argc, char *argv[]) { - int level = -1, activeLevel = -1; - // The expected value is -1, initialize to different value. - int ancestorTNumNeg = 1, teamSizeNeg = 1; - int ancestorTNum0 = -1, teamSize0 = -1; - // The expected value is -1, initialize to different value. 
- int ancestorTNum1 = 1, teamSize1 = 1; - int check1[MaxThreads]; - int check2[MaxThreads]; - int check3[MaxThreads]; - int check4[MaxThreads]; - for (int i = 0; i < MaxThreads; i++) { - check1[i] = check2[i] = check3[i] = check4[i] = 0; - } - - #pragma omp target map(level, activeLevel, ancestorTNumNeg, teamSizeNeg) \ - map(ancestorTNum0, teamSize0, ancestorTNum1, teamSize1) \ - map(check1[:], check2[:], check3[:], check4[:]) - { - level = omp_get_level(); - activeLevel = omp_get_active_level(); - - // Expected to return -1. - ancestorTNumNeg = omp_get_ancestor_thread_num(-1); - teamSizeNeg = omp_get_team_size(-1); - - // Expected to return 0 and 1. - ancestorTNum0 = omp_get_ancestor_thread_num(0); - teamSize0 = omp_get_team_size(0); - - // Expected to return -1 because the requested level is larger than - // the nest level. - ancestorTNum1 = omp_get_ancestor_thread_num(1); - teamSize1 = omp_get_team_size(1); - - // Expecting active parallel region. - #pragma omp parallel num_threads(NumThreads) - { - int id = omp_get_thread_num(); - // Multiply return value of omp_get_level by 5 to avoid that this test - // passes if both API calls return wrong values. - check1[id] += omp_get_level() * 5 + omp_get_active_level(); - - // Expected to return 0 and 1. - check2[id] += omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0); - // Expected to return the current thread num. - check2[id] += (omp_get_ancestor_thread_num(1) - id); - // Expected to return the current number of threads. - check2[id] += 3 * omp_get_team_size(1); - // Expected to return -1, see above. - check2[id] += omp_get_ancestor_thread_num(2) + omp_get_team_size(2); - - // Expecting serialized parallel region. - #pragma omp parallel - { - #pragma omp atomic - check3[id] += omp_get_level() * 5 + omp_get_active_level(); - - // Expected to return 0 and 1. - int check4Inc = omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0); - // Expected to return the parent thread num. 
- check4Inc += (omp_get_ancestor_thread_num(1) - id); - // Expected to return the number of threads in the active parallel region. - check4Inc += 3 * omp_get_team_size(1); - // Expected to return 0 and 1. - check4Inc += omp_get_ancestor_thread_num(2) + 3 * omp_get_team_size(2); - // Expected to return -1, see above. - check4Inc += omp_get_ancestor_thread_num(3) + omp_get_team_size(3); - - #pragma omp atomic - check4[id] += check4Inc; - } - } - } - - // CHECK: target: level = 0, activeLevel = 0 - printf("target: level = %d, activeLevel = %d\n", level, activeLevel); - // CHECK: level = -1: ancestorTNum = -1, teamSize = -1 - printf("level = -1: ancestorTNum = %d, teamSize = %d\n", ancestorTNumNeg, teamSizeNeg); - // CHECK: level = 0: ancestorTNum = 0, teamSize = 1 - printf("level = 0: ancestorTNum = %d, teamSize = %d\n", ancestorTNum0, teamSize0); - // CHECK: level = 1: ancestorTNum = -1, teamSize = -1 - printf("level = 1: ancestorTNum = %d, teamSize = %d\n", ancestorTNum1, teamSize1); - - // CHECK-NOT: invalid - for (int i = 0; i < MaxThreads; i++) { - // Check active parallel region: - // omp_get_level() = 1, omp_get_active_level() = 1 - const int Expected1 = 6; - if (i < NumThreads) { - if (check1[i] != Expected1) { - printf("invalid: check1[%d] should be %d, is %d\n", i, Expected1, check1[i]); - } - } else if (check1[i] != 0) { - printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); - } - - // 5 * 1 + 3 * 64 - 1 - 1 (see above) - const int Expected2 = 195; - if (i < NumThreads) { - if (check2[i] != Expected2) { - printf("invalid: check2[%d] should be %d, is %d\n", i, Expected2, check2[i]); - } - } else if (check2[i] != 0) { - printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); - } - - // Check serialized parallel region: - // omp_get_level() = 2, omp_get_active_level() = 1 - const int Expected3 = 11; - if (i < NumThreads) { - if (check3[i] != Expected3) { - printf("invalid: check3[%d] should be %d, is %d\n", i, Expected3, check3[i]); - } 
- } else if (check3[i] != 0) { - printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]); - } - - // 5 * 1 + 3 * 64 + 3 * 1 - 1 - 1 (see above) - const int Expected4 = 198; - if (i < NumThreads) { - if (check4[i] != Expected4) { - printf("invalid: check4[%d] should be %d, is %d\n", i, Expected4, check4[i]); - } - } else if (check4[i] != 0) { - printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]); - } - } - - // Check for paraller level in non-SPMD kernels. - level = 0; - #pragma omp target teams distribute num_teams(1) thread_limit(32) reduction(+:level) - for (int i=0; i<5032; i+=32) { - int ub = (i+32 > 5032) ? 5032 : i+32; - #pragma omp parallel for schedule(dynamic) - for (int j=i ; j < ub; j++) ; - level += omp_get_level(); - } - // CHECK: Integral level = 0. - printf("Integral level = %d.\n", level); - - return 0; -} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c +++ /dev/null @@ -1,136 +0,0 @@ -// RUN: %compile-run-and-check - -#include -#include - -const int MaxThreads = 1024; -const int NumThreads = 64; -const int NumThreads1 = 1; - -int main(int argc, char *argv[]) { - int inParallel = -1, numThreads = -1, threadNum = -1; - int check1[MaxThreads]; - int check2[MaxThreads]; - for (int i = 0; i < MaxThreads; i++) { - check1[i] = check2[i] = 0; - } - -#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:]) - { - inParallel = omp_in_parallel(); - numThreads = omp_get_num_threads(); - threadNum = omp_get_thread_num(); - -// Expecting active parallel region. -#pragma omp parallel num_threads(NumThreads) - { - int id = omp_get_thread_num(); - check1[id] += omp_get_num_threads() + omp_in_parallel(); - -// Expecting serialized parallel region. -#pragma omp parallel - { - // Expected to be 1. 
- int nestedInParallel = omp_in_parallel(); - // Expected to be 1. - int nestedNumThreads = omp_get_num_threads(); - // Expected to be 0. - int nestedThreadNum = omp_get_thread_num(); -#pragma omp atomic - check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum; - } - } - } - - // CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0 - printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n", - inParallel, numThreads, threadNum); - - // CHECK-NOT: invalid - for (int i = 0; i < MaxThreads; i++) { - // Check that all threads reported - // omp_get_num_threads() = 64, omp_in_parallel() = 1. - int Expected = NumThreads + 1; - if (i < NumThreads) { - if (check1[i] != Expected) { - printf("invalid: check1[%d] should be %d, is %d\n", i, Expected, - check1[i]); - } - } else if (check1[i] != 0) { - printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); - } - - // Check serialized parallel region. - if (i < NumThreads) { - if (check2[i] != 2) { - printf("invalid: check2[%d] should be 2, is %d\n", i, check2[i]); - } - } else if (check2[i] != 0) { - printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); - } - } - - inParallel = -1; - numThreads = -1; - threadNum = -1; - for (int i = 0; i < MaxThreads; i++) { - check1[i] = check2[i] = 0; - } - -#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:]) - { - inParallel = omp_in_parallel(); - numThreads = omp_get_num_threads(); - threadNum = omp_get_thread_num(); - -// Expecting active parallel region. -#pragma omp parallel num_threads(NumThreads1) - { - int id = omp_get_thread_num(); - check1[id] += omp_get_num_threads() + omp_in_parallel(); - -// Expecting serialized parallel region. -#pragma omp parallel - { - // Expected to be 0. - int nestedInParallel = omp_in_parallel(); - // Expected to be 1. - int nestedNumThreads = omp_get_num_threads(); - // Expected to be 0. 
- int nestedThreadNum = omp_get_thread_num(); -#pragma omp atomic - check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum; - } - } - } - - // CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0 - printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n", - inParallel, numThreads, threadNum); - - // CHECK-NOT: invalid - for (int i = 0; i < MaxThreads; i++) { - // Check that all threads reported - // omp_get_num_threads() = 1, omp_in_parallel() = 0. - int Expected = 1; - if (i < NumThreads1) { - if (check1[i] != Expected) { - printf("invalid: check1[%d] should be %d, is %d\n", i, Expected, - check1[i]); - } - } else if (check1[i] != 0) { - printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); - } - - // Check serialized parallel region. - if (i < NumThreads1) { - if (check2[i] != 1) { - printf("invalid: check2[%d] should be 1, is %d\n", i, check2[i]); - } - } else if (check2[i] != 0) { - printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); - } - } - - return 0; -} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c +++ /dev/null @@ -1,102 +0,0 @@ -// RUN: %compile-run-and-check - -#include -#include - -const int WarpSize = 32; -const int NumThreads1 = 1 * WarpSize; -const int NumThreads2 = 2 * WarpSize; -const int NumThreads3 = 3 * WarpSize; -const int MaxThreads = 1024; - -int main(int argc, char *argv[]) { - int check1[MaxThreads]; - int check2[MaxThreads]; - int check3[MaxThreads]; - int check4[MaxThreads]; - for (int i = 0; i < MaxThreads; i++) { - check1[i] = check2[i] = check3[i] = check4[i] = 0; - } - - int maxThreads1 = -1; - int maxThreads2 = -1; - int maxThreads3 = -1; - - #pragma omp target map(check1[:], check2[:], check3[:], check4[:]) \ - map(maxThreads1, maxThreads2, maxThreads3) - { - #pragma omp parallel 
num_threads(NumThreads1) - { - check1[omp_get_thread_num()] += omp_get_num_threads(); - } - - // API method to set number of threads in parallel regions without - // num_threads() clause. - omp_set_num_threads(NumThreads2); - maxThreads1 = omp_get_max_threads(); - #pragma omp parallel - { - check2[omp_get_thread_num()] += omp_get_num_threads(); - } - - maxThreads2 = omp_get_max_threads(); - - // num_threads() clause should override nthreads-var ICV. - #pragma omp parallel num_threads(NumThreads3) - { - check3[omp_get_thread_num()] += omp_get_num_threads(); - } - - maxThreads3 = omp_get_max_threads(); - - // Effect from omp_set_num_threads() should still be visible. - #pragma omp parallel - { - check4[omp_get_thread_num()] += omp_get_num_threads(); - } - } - - // CHECK: maxThreads1 = 64 - printf("maxThreads1 = %d\n", maxThreads1); - // CHECK: maxThreads2 = 64 - printf("maxThreads2 = %d\n", maxThreads2); - // CHECK: maxThreads3 = 64 - printf("maxThreads3 = %d\n", maxThreads3); - - // CHECK-NOT: invalid - for (int i = 0; i < MaxThreads; i++) { - if (i < NumThreads1) { - if (check1[i] != NumThreads1) { - printf("invalid: check1[%d] should be %d, is %d\n", i, NumThreads1, check1[i]); - } - } else if (check1[i] != 0) { - printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); - } - - if (i < NumThreads2) { - if (check2[i] != NumThreads2) { - printf("invalid: check2[%d] should be %d, is %d\n", i, NumThreads2, check2[i]); - } - } else if (check2[i] != 0) { - printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); - } - - if (i < NumThreads3) { - if (check3[i] != NumThreads3) { - printf("invalid: check3[%d] should be %d, is %d\n", i, NumThreads3, check3[i]); - } - } else if (check3[i] != 0) { - printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]); - } - - if (i < NumThreads2) { - if (check4[i] != NumThreads2) { - printf("invalid: check4[%d] should be %d, is %d\n", i, NumThreads2, check4[i]); - } - } else if (check4[i] != 0) { - 
printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]); - } - } - - return 0; -} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// RUN: %compilexx-run-and-check - -#include -#include - -int main(void) { - int isHost = -1; - int ParallelLevel1 = -1, ParallelLevel2 = -1; - int Count = 0; - -#pragma omp target parallel for map(tofrom \ - : isHost, ParallelLevel1, ParallelLevel2), reduction(+: Count) schedule(static, 1) - for (int J = 0; J < 10; ++J) { -#pragma omp critical - { - isHost = (isHost < 0 || isHost == 0) ? omp_is_initial_device() : isHost; - ParallelLevel1 = (ParallelLevel1 < 0 || ParallelLevel1 == 1) - ? omp_get_level() - : ParallelLevel1; - } - if (omp_get_thread_num() > 5) { - int L2; -#pragma omp parallel for schedule(dynamic) lastprivate(L2) reduction(+: Count) - for (int I = 0; I < 10; ++I) { - L2 = omp_get_level(); - Count += omp_get_level(); // (10-6)*10*2 = 80 - } -#pragma omp critical - ParallelLevel2 = - (ParallelLevel2 < 0 || ParallelLevel2 == 2) ? L2 : ParallelLevel2; - } else { - Count += omp_get_level(); // 6 * 1 = 6 - } - } - - if (isHost < 0) { - printf("Runtime error, isHost=%d\n", isHost); - } - - // CHECK: Target region executed on the device - printf("Target region executed on the %s\n", isHost ? 
"host" : "device"); - // CHECK: Parallel level in SPMD mode: L1 is 1, L2 is 2 - printf("Parallel level in SPMD mode: L1 is %d, L2 is %d\n", ParallelLevel1, - ParallelLevel2); - // Final result of Count is (10-6)(num of loops)*10(num of iterations)*2(par - // level) + 6(num of iterations) * 1(par level) - // CHECK: Expected count = 86 - printf("Expected count = %d\n", Count); - - return isHost; -} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/thread_limit.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/thread_limit.c deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/thread_limit.c +++ /dev/null @@ -1,77 +0,0 @@ -// RUN: %compile-run-and-check - -#include -#include - -const int WarpSize = 32; -const int ThreadLimit = 1 * WarpSize; -const int NumThreads2 = 2 * WarpSize; -const int NumThreads3 = 3 * WarpSize; -const int MaxThreads = 1024; - -int main(int argc, char *argv[]) { - int check1[MaxThreads]; - int check2[MaxThreads]; - int check3[MaxThreads]; - for (int i = 0; i < MaxThreads; i++) { - check1[i] = check2[i] = check3[i] = 0; - } - - int threadLimit = -1; - - #pragma omp target teams num_teams(1) thread_limit(ThreadLimit) \ - map(check1[:], check2[:], check3[:], threadLimit) - { - threadLimit = omp_get_thread_limit(); - - // All parallel regions should get as many threads as specified by the - // thread_limit() clause. 
- #pragma omp parallel - { - check1[omp_get_thread_num()] += omp_get_num_threads(); - } - - omp_set_num_threads(NumThreads2); - #pragma omp parallel - { - check2[omp_get_thread_num()] += omp_get_num_threads(); - } - - #pragma omp parallel num_threads(NumThreads3) - { - check3[omp_get_thread_num()] += omp_get_num_threads(); - } - } - - // CHECK: threadLimit = 32 - printf("threadLimit = %d\n", threadLimit); - - // CHECK-NOT: invalid - for (int i = 0; i < MaxThreads; i++) { - if (i < ThreadLimit) { - if (check1[i] != ThreadLimit) { - printf("invalid: check1[%d] should be %d, is %d\n", i, ThreadLimit, check1[i]); - } - } else if (check1[i] != 0) { - printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); - } - - if (i < ThreadLimit) { - if (check2[i] != ThreadLimit) { - printf("invalid: check2[%d] should be %d, is %d\n", i, ThreadLimit, check2[i]); - } - } else if (check2[i] != 0) { - printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); - } - - if (i < ThreadLimit) { - if (check3[i] != ThreadLimit) { - printf("invalid: check3[%d] should be %d, is %d\n", i, ThreadLimit, check3[i]); - } - } else if (check3[i] != 0) { - printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]); - } - } - - return 0; -} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/tripcount.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/tripcount.c deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/tripcount.c +++ /dev/null @@ -1,22 +0,0 @@ -// RUN: %compile-run-and-check - -#include -#include - -int main() { - int res = 0; - -#pragma omp parallel num_threads(2) reduction(+:res) - { - int tid = omp_get_thread_num(); -#pragma omp target teams distribute reduction(+:res) - for (int i = tid; i < 2; i++) - ++res; - } - // The first thread makes 2 iterations, the second - 1. Expected result of the - // reduction res is 3. - - // CHECK: res = 3. 
- printf("res = %d.\n", res); - return 0; -} diff --git a/openmp/libomptarget/deviceRTLs/target_interface.h b/openmp/libomptarget/deviceRTLs/target_interface.h deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/target_interface.h +++ /dev/null @@ -1,78 +0,0 @@ -//===------------- target_interface.h - Target interfaces --------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains interfaces that must be implemented by each target. -// -//===----------------------------------------------------------------------===// - -#ifndef _OMPTARGET_TARGET_INTERFACE_H_ -#define _OMPTARGET_TARGET_INTERFACE_H_ - -#include - -#include "DeviceEnvironment.h" -#include "target_impl.h" - -// Calls to the NVPTX layer (assuming 1D layout) -EXTERN int __kmpc_get_hardware_thread_id_in_block(); -EXTERN int GetBlockIdInKernel(); -EXTERN NOINLINE int __kmpc_get_hardware_num_blocks(); -EXTERN NOINLINE int __kmpc_get_hardware_num_threads_in_block(); -EXTERN unsigned __kmpc_get_warp_size(); -EXTERN unsigned GetWarpId(); -EXTERN unsigned GetLaneId(); - -// Atomics -uint32_t __kmpc_atomic_add(uint32_t *, uint32_t); -uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t); -uint32_t __kmpc_atomic_max(uint32_t *, uint32_t); -uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t); -uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t); -static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); -unsigned long long __kmpc_atomic_exchange(unsigned long long *, - unsigned long long); -unsigned long long __kmpc_atomic_add(unsigned long long *, unsigned long long); - -// Locks -EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock); -EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock); -EXTERN void 
__kmpc_impl_set_lock(omp_lock_t *lock); -EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock); -EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock); - -EXTERN void __kmpc_impl_threadfence(); -EXTERN void __kmpc_impl_threadfence_block(); -EXTERN void __kmpc_impl_threadfence_system(); - -EXTERN double __kmpc_impl_get_wtick(); -EXTERN double __kmpc_impl_get_wtime(); - -EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi); -EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi); -EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt(); -EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt(); -EXTERN uint32_t __kmpc_impl_smid(); - -EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask(); - -EXTERN void __kmpc_impl_syncthreads(); -EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask); - -// Kernel initialization -EXTERN void __kmpc_impl_target_init(); - -// Memory -EXTERN void *__kmpc_impl_malloc(size_t); -EXTERN void __kmpc_impl_free(void *); - -// Barrier until num_threads arrive. 
-EXTERN void __kmpc_impl_named_sync(uint32_t num_threads); - -extern DeviceEnvironmentTy omptarget_device_environment; - -#endif // _OMPTARGET_TARGET_INTERFACE_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt --- a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt +++ b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt @@ -118,6 +118,6 @@ libomptarget_say("Not generating amdgcn test targets as amdgpu-arch exited with ${amdgpu_arch_result}") else() # Report to the parent scope that we are building a plugin for amdgpu - set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa-newRTL " PARENT_SCOPE) + set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa " PARENT_SCOPE) endif() diff --git a/openmp/libomptarget/plugins/cuda/CMakeLists.txt b/openmp/libomptarget/plugins/cuda/CMakeLists.txt --- a/openmp/libomptarget/plugins/cuda/CMakeLists.txt +++ b/openmp/libomptarget/plugins/cuda/CMakeLists.txt @@ -72,7 +72,7 @@ # Otherwise this plugin is being built speculatively and there may be no cuda available if (LIBOMPTARGET_CAN_LINK_LIBCUDA OR LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA) libomptarget_say("Enable tests using CUDA plugin") - set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda-newRTL nvptx64-nvidia-cuda-newDriver" PARENT_SCOPE) + set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda nvptx64-nvidia-cuda-newDriver" PARENT_SCOPE) else() libomptarget_say("Disabling tests using CUDA plugin as cuda may not be available") endif() diff --git a/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c b/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c --- a/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c +++ b/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c @@ -1,4 +1,4 @@ -// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -fopenmp-target-new-runtime +// RUN: 
%libomptarget-compile-nvptx64-nvidia-cuda // RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \ // RUN: %libomptarget-run-nvptx64-nvidia-cuda | %fcheck-nvptx64-nvidia-cuda // REQUIRES: nvptx64-nvidia-cuda diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg --- a/openmp/libomptarget/test/lit.cfg +++ b/openmp/libomptarget/test/lit.cfg @@ -104,17 +104,11 @@ config.test_flags += " --libomptarget-amdgcn-bc-path=" + config.library_dir if config.libomptarget_current_target.startswith('nvptx'): config.test_flags += " --libomptarget-nvptx-bc-path=" + config.library_dir - if config.libomptarget_current_target.endswith('-newRTL'): - config.test_flags += " -fopenmp-target-new-runtime" - elif not config.libomptarget_current_target.endswith('-newDriver'): - config.test_flags += " -fno-openmp-target-new-runtime" if config.libomptarget_current_target.endswith('-newDriver'): config.test_flags += " -fopenmp-new-driver" -def remove_newRTL_suffix_if_present(name): - if name.endswith('-newRTL'): - return name[:-7] - elif name.endswith('-newDriver'): +def remove_suffix_if_present(name): + if name.endswith('-newDriver'): return name[:-10] else: return name @@ -183,10 +177,10 @@ "%not --crash %t")) config.substitutions.append(("%clangxx-" + libomptarget_target, \ "%clangxx %openmp_flags %cuda_flags %flags -fopenmp-targets=" +\ - remove_newRTL_suffix_if_present(libomptarget_target))) + remove_suffix_if_present(libomptarget_target))) config.substitutions.append(("%clang-" + libomptarget_target, \ "%clang %openmp_flags %cuda_flags %flags -fopenmp-targets=" +\ - remove_newRTL_suffix_if_present(libomptarget_target))) + remove_suffix_if_present(libomptarget_target))) config.substitutions.append(("%fcheck-" + libomptarget_target, \ config.libomptarget_filecheck + " %s")) else: diff --git a/openmp/libomptarget/test/mapping/data_member_ref.cpp b/openmp/libomptarget/test/mapping/data_member_ref.cpp --- a/openmp/libomptarget/test/mapping/data_member_ref.cpp +++ 
b/openmp/libomptarget/test/mapping/data_member_ref.cpp @@ -2,7 +2,6 @@ // Wrong results on amdgpu // XFAIL: amdgcn-amd-amdhsa -// XFAIL: amdgcn-amd-amdhsa-newRTL #include diff --git a/openmp/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp b/openmp/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp --- a/openmp/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp +++ b/openmp/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp @@ -2,7 +2,6 @@ // Wrong results on amdgpu // XFAIL: amdgcn-amd-amdhsa -// XFAIL: amdgcn-amd-amdhsa-newRTL #include #include diff --git a/openmp/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp b/openmp/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp --- a/openmp/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp +++ b/openmp/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp @@ -2,7 +2,6 @@ // Wrong results on amdgpu // XFAIL: amdgcn-amd-amdhsa -// XFAIL: amdgcn-amd-amdhsa-newRTL #include #include diff --git a/openmp/libomptarget/test/mapping/lambda_by_value.cpp b/openmp/libomptarget/test/mapping/lambda_by_value.cpp --- a/openmp/libomptarget/test/mapping/lambda_by_value.cpp +++ b/openmp/libomptarget/test/mapping/lambda_by_value.cpp @@ -2,7 +2,6 @@ // Wrong results on amdgpu // XFAIL: amdgcn-amd-amdhsa -// XFAIL: amdgcn-amd-amdhsa-newRTL #include #include diff --git a/openmp/libomptarget/test/mapping/lambda_mapping.cpp b/openmp/libomptarget/test/mapping/lambda_mapping.cpp --- a/openmp/libomptarget/test/mapping/lambda_mapping.cpp +++ b/openmp/libomptarget/test/mapping/lambda_mapping.cpp @@ -1,7 +1,7 @@ // RUN: %libomptarget-compilexx-run-and-check-generic // Error on the gpu that crashes the host -// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL +// UNSUPPORTED: amdgcn-amd-amdhsa #include diff --git 
a/openmp/libomptarget/test/mapping/ompx_hold/struct.c b/openmp/libomptarget/test/mapping/ompx_hold/struct.c --- a/openmp/libomptarget/test/mapping/ompx_hold/struct.c +++ b/openmp/libomptarget/test/mapping/ompx_hold/struct.c @@ -3,7 +3,6 @@ // Wrong results on amdgpu // XFAIL: amdgcn-amd-amdhsa -// XFAIL: amdgcn-amd-amdhsa-newRTL #include #include diff --git a/openmp/libomptarget/test/offloading/bug49021.cpp b/openmp/libomptarget/test/offloading/bug49021.cpp --- a/openmp/libomptarget/test/offloading/bug49021.cpp +++ b/openmp/libomptarget/test/offloading/bug49021.cpp @@ -2,7 +2,6 @@ // Hangs // UNSUPPORTED: amdgcn-amd-amdhsa -// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL // UNSUPPORTED: amdgcn-amd-amdhsa-newDriver #include diff --git a/openmp/libomptarget/test/offloading/bug49334.cpp b/openmp/libomptarget/test/offloading/bug49334.cpp --- a/openmp/libomptarget/test/offloading/bug49334.cpp +++ b/openmp/libomptarget/test/offloading/bug49334.cpp @@ -2,7 +2,6 @@ // Currently hangs on amdgpu // UNSUPPORTED: amdgcn-amd-amdhsa -// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL // UNSUPPORTED: x86_64-pc-linux-gnu #include diff --git a/openmp/libomptarget/test/offloading/bug51781.c b/openmp/libomptarget/test/offloading/bug51781.c --- a/openmp/libomptarget/test/offloading/bug51781.c +++ b/openmp/libomptarget/test/offloading/bug51781.c @@ -34,7 +34,6 @@ // Hangs // UNSUPPORTED: amdgcn-amd-amdhsa -// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL // UNSUPPORTED: amdgcn-amd-amdhsa-newDriver #if ADD_REDUCTION diff --git a/openmp/libomptarget/test/offloading/global_constructor.cpp b/openmp/libomptarget/test/offloading/global_constructor.cpp --- a/openmp/libomptarget/test/offloading/global_constructor.cpp +++ b/openmp/libomptarget/test/offloading/global_constructor.cpp @@ -2,7 +2,6 @@ // Fails in DAGToDAG on an address space problem // UNSUPPORTED: amdgcn-amd-amdhsa -// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL #include #include diff --git a/openmp/libomptarget/test/offloading/host_as_target.c b/openmp/libomptarget/test/offloading/host_as_target.c --- a/openmp/libomptarget/test/offloading/host_as_target.c +++ b/openmp/libomptarget/test/offloading/host_as_target.c @@ -9,7 +9,6 @@ // amdgpu does not have a working printf definition // XFAIL: amdgcn-amd-amdhsa -// XFAIL: amdgcn-amd-amdhsa-newRTL #include #include diff --git a/openmp/libomptarget/test/unified_shared_memory/api.c b/openmp/libomptarget/test/unified_shared_memory/api.c --- a/openmp/libomptarget/test/unified_shared_memory/api.c +++ b/openmp/libomptarget/test/unified_shared_memory/api.c @@ -1,11 +1,9 @@ // RUN: %libomptarget-compile-run-and-check-generic // XFAIL: nvptx64-nvidia-cuda -// XFAIL: nvptx64-nvidia-cuda-newRTL // XFAIL: nvptx64-nvidia-cuda-newDriver // Fails on amdgpu with error: GPU Memory Error // XFAIL: amdgcn-amd-amdhsa -// XFAIL: amdgcn-amd-amdhsa-newRTL // XFAIL: amdgcn-amd-amdhsa-newDriver #include diff --git a/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c b/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c --- a/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c +++ b/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c @@ -5,7 +5,6 @@ // Fails on amdgpu with error: GPU Memory Error // XFAIL: amdgcn-amd-amdhsa -// XFAIL: amdgcn-amd-amdhsa-newRTL #include #include diff --git a/openmp/libomptarget/test/unified_shared_memory/close_modifier.c b/openmp/libomptarget/test/unified_shared_memory/close_modifier.c --- a/openmp/libomptarget/test/unified_shared_memory/close_modifier.c +++ b/openmp/libomptarget/test/unified_shared_memory/close_modifier.c @@ -5,7 +5,6 @@ // amdgpu runtime crash // UNSUPPORTED: amdgcn-amd-amdhsa -// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL #include diff --git a/openmp/libomptarget/test/unified_shared_memory/shared_update.c b/openmp/libomptarget/test/unified_shared_memory/shared_update.c --- a/openmp/libomptarget/test/unified_shared_memory/shared_update.c +++ b/openmp/libomptarget/test/unified_shared_memory/shared_update.c @@ -4,7 +4,6 @@ // amdgpu runtime crash // UNSUPPORTED: amdgcn-amd-amdhsa -// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL #include #include