diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -19,6 +19,7 @@
 #include "clang/AST/StmtVisitor.h"
 #include "clang/Basic/Cuda.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
 
 using namespace clang;
@@ -196,11 +197,10 @@
 /// code. For all practical purposes this is fine because the configuration
 /// is the same for all known NVPTX architectures.
 enum MachineConfiguration : unsigned {
-  WarpSize = 32,
-  /// Number of bits required to represent a lane identifier, which is
-  /// computed as log_2(WarpSize).
+  /// Number of bits required to represent a lane identifier
+  /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
+  /// specific Grid Values like GV_Warp_Size
   LaneIDBits = 5,
-  LaneIDMask = WarpSize - 1,
 
   /// Global memory alignment for performance.
   GlobalMemoryAlignment = 128,
@@ -436,6 +436,7 @@
       EscapedDeclsForTeams = EscapedDecls.getArrayRef();
     else
       EscapedDeclsForParallel = EscapedDecls.getArrayRef();
+    unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
     GlobalizedRD = ::buildRecordForGlobalizedVars(
         CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
         MappedDeclsFields, WarpSize);
@@ -624,6 +625,12 @@
 
 /// Get the GPU warp size.
 static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
+  if (CGF.getTarget().getTriple().isAMDGCN()) {
+    CGBuilderTy &Bld = CGF.Builder;
+    // return constant compile-time target-specific warp size
+    unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
+    return Bld.getInt32(WarpSize);
+  }
   return CGF.EmitRuntimeCall(
       llvm::Intrinsic::getDeclaration(
           &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
@@ -643,6 +650,8 @@
 /// on the NVPTX device, to generate more efficient code.
 static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
   CGBuilderTy &Bld = CGF.Builder;
+  unsigned LaneIDBits =
+      CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2);
   return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
 }
 
@@ -651,6 +660,8 @@
 /// on the NVPTX device, to generate more efficient code.
 static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
   CGBuilderTy &Bld = CGF.Builder;
+  unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue(
+      llvm::omp::GV_Warp_Size_Log2_Mask);
   return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
                        "nvptx_lane_id");
 }
@@ -2073,6 +2084,7 @@
       getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
   if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
     getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
+    unsigned WarpSize = CGM.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
     if (!LastPrivatesReductions.empty()) {
       GlobalizedRD = ::buildRecordForGlobalizedVars(
           CGM.getContext(), llvm::None, LastPrivatesReductions,
@@ -3243,6 +3255,7 @@
       "__openmp_nvptx_data_transfer_temporary_storage";
   llvm::GlobalVariable *TransferMedium =
       M.getGlobalVariable(TransferMediumName);
+  unsigned WarpSize = CGM.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
   if (!TransferMedium) {
     auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
     unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
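
The patch replaces the hard-coded WarpSize/LaneIDBits/LaneIDMask enum constants with per-target grid-value lookups. As a minimal standalone sketch (not part of the patch, and not the LLVM API): the GridValues struct, warpId, and laneId names below are illustrative, and the warp sizes of 32 for NVPTX and 64 for AMDGCN wave64 are assumptions chosen to show why a compile-time WarpSize = 32 no longer works once AMDGCN is a target.

#include <cassert>
#include <cstdint>

// Illustrative stand-in for the per-target grid values the patch queries
// via TargetInfo::getGridValue (GV_Warp_Size, GV_Warp_Size_Log2,
// GV_Warp_Size_Log2_Mask).
struct GridValues {
  uint32_t WarpSize;     // GV_Warp_Size
  uint32_t WarpSizeLog2; // GV_Warp_Size_Log2
  uint32_t LaneMask;     // GV_Warp_Size_Log2_Mask (WarpSize - 1)
};

constexpr GridValues NVPTX{32, 5, 31};  // assumed NVPTX values
constexpr GridValues AMDGCN{64, 6, 63}; // assumed AMDGCN wave64 values

// Same arithmetic the patch emits as IR in getNVPTXWarpID/getNVPTXLaneID:
// warp id = thread id >> log2(warp size), lane id = thread id & (warp size - 1).
constexpr uint32_t warpId(GridValues GV, uint32_t Tid) {
  return Tid >> GV.WarpSizeLog2;
}
constexpr uint32_t laneId(GridValues GV, uint32_t Tid) {
  return Tid & GV.LaneMask;
}

int main() {
  // Thread 70 lands in warp 2 on a 32-wide target but warp 1 on a 64-wide
  // one, which is why the constants must come from the target, not an enum.
  assert(warpId(NVPTX, 70) == 2 && laneId(NVPTX, 70) == 6);
  assert(warpId(AMDGCN, 70) == 1 && laneId(AMDGCN, 70) == 6);
  return 0;
}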